[openssl/f17] new upstream release fixing multiple CVEs

Tomáš Mráz tmraz at fedoraproject.org
Tue Feb 19 20:57:11 UTC 2013


commit 36472b541d49ddaa5488d37aaf4cba1b45180e25
Author: Tomas Mraz <tmraz at fedoraproject.org>
Date:   Tue Feb 19 21:57:05 2013 +0100

    new upstream release fixing multiple CVEs

 .gitignore                                         |    1 +
 openssl-0.9.8j-env-nozlib.patch                    |   13 -
 openssl-1.0.0j-version.patch                       |   21 -
 openssl-1.0.0k-backports.patch                     |  775 ++
 ...-1.0.0f-fips.patch => openssl-1.0.0k-fips.patch | 1233 ++--
 ...telopts.patch => openssl-1.0.0k-intelopts.patch | 9656 ++++++++++----------
 openssl-1.0.0k-secure-getenv.patch                 |  154 +
 openssl-1.0.0k-version.patch                       |   21 +
 openssl-1.0.1e-env-zlib.patch                      |   29 +
 openssl.spec                                       |   21 +-
 sources                                            |    2 +-
 11 files changed, 6438 insertions(+), 5488 deletions(-)
---
diff --git a/.gitignore b/.gitignore
index 47c7de5..59ed437 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ openssl-1.0.0a-usa.tar.bz2
 /openssl-1.0.0h-usa.tar.xz
 /openssl-1.0.0i-usa.tar.xz
 /openssl-1.0.0j-usa.tar.xz
+/openssl-1.0.0k-usa.tar.xz
diff --git a/openssl-1.0.0k-backports.patch b/openssl-1.0.0k-backports.patch
new file mode 100644
index 0000000..05661e9
--- /dev/null
+++ b/openssl-1.0.0k-backports.patch
@@ -0,0 +1,775 @@
+diff --git a/doc/crypto/X509_STORE_CTX_get_error.pod b/doc/crypto/X509_STORE_CTX_get_error.pod
+index a883f6c..60e8332 100644
+--- a/doc/crypto/X509_STORE_CTX_get_error.pod
++++ b/doc/crypto/X509_STORE_CTX_get_error.pod
+@@ -278,6 +278,8 @@ happen if extended CRL checking is enabled.
+ an application specific error. This will never be returned unless explicitly
+ set by an application.
+ 
++=back
++
+ =head1 NOTES
+ 
+ The above functions should be used instead of directly referencing the fields
+diff --git a/doc/ssl/SSL_CTX_set_client_CA_list.pod b/doc/ssl/SSL_CTX_set_client_CA_list.pod
+index 632b556..5e66133 100644
+--- a/doc/ssl/SSL_CTX_set_client_CA_list.pod
++++ b/doc/ssl/SSL_CTX_set_client_CA_list.pod
+@@ -66,16 +66,16 @@ values:
+ 
+ =over 4
+ 
+-=item 1
+-
+-The operation succeeded.
+-
+ =item 0
+ 
+ A failure while manipulating the STACK_OF(X509_NAME) object occurred or
+ the X509_NAME could not be extracted from B<cacert>. Check the error stack
+ to find out the reason.
+ 
++=item 1
++
++The operation succeeded.
++
+ =back
+ 
+ =head1 EXAMPLES
+diff --git a/doc/ssl/SSL_CTX_use_psk_identity_hint.pod b/doc/ssl/SSL_CTX_use_psk_identity_hint.pod
+index b80e25b..7e60df5 100644
+--- a/doc/ssl/SSL_CTX_use_psk_identity_hint.pod
++++ b/doc/ssl/SSL_CTX_use_psk_identity_hint.pod
+@@ -81,6 +81,8 @@ SSL_CTX_use_psk_identity_hint() and SSL_use_psk_identity_hint() return
+ 
+ Return values from the server callback are interpreted as follows:
+ 
++=over 4
++
+ =item > 0
+ 
+ PSK identity was found and the server callback has provided the PSK
+@@ -99,4 +101,6 @@ completely.
+ PSK identity was not found. An "unknown_psk_identity" alert message
+ will be sent and the connection setup fails.
+ 
++=back
++
+ =cut
+diff --git a/doc/ssl/SSL_accept.pod b/doc/ssl/SSL_accept.pod
+index cc724c0..b1c34d1 100644
+--- a/doc/ssl/SSL_accept.pod
++++ b/doc/ssl/SSL_accept.pod
+@@ -44,17 +44,17 @@ The following return values can occur:
+ 
+ =over 4
+ 
+-=item 1
+-
+-The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
+-established.
+-
+ =item 0
+ 
+ The TLS/SSL handshake was not successful but was shut down controlled and
+ by the specifications of the TLS/SSL protocol. Call SSL_get_error() with the
+ return value B<ret> to find out the reason.
+ 
++=item 1
++
++The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
++established.
++
+ =item E<lt>0
+ 
+ The TLS/SSL handshake was not successful because a fatal error occurred either
+diff --git a/doc/ssl/SSL_connect.pod b/doc/ssl/SSL_connect.pod
+index cc56ebb..946ca89 100644
+--- a/doc/ssl/SSL_connect.pod
++++ b/doc/ssl/SSL_connect.pod
+@@ -41,17 +41,17 @@ The following return values can occur:
+ 
+ =over 4
+ 
+-=item 1
+-
+-The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
+-established.
+-
+ =item 0
+ 
+ The TLS/SSL handshake was not successful but was shut down controlled and
+ by the specifications of the TLS/SSL protocol. Call SSL_get_error() with the
+ return value B<ret> to find out the reason.
+ 
++=item 1
++
++The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
++established.
++
+ =item E<lt>0
+ 
+ The TLS/SSL handshake was not successful, because a fatal error occurred either
+diff --git a/doc/ssl/SSL_do_handshake.pod b/doc/ssl/SSL_do_handshake.pod
+index 2435764..7f8cf24 100644
+--- a/doc/ssl/SSL_do_handshake.pod
++++ b/doc/ssl/SSL_do_handshake.pod
+@@ -45,17 +45,17 @@ The following return values can occur:
+ 
+ =over 4
+ 
+-=item 1
+-
+-The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
+-established.
+-
+ =item 0
+ 
+ The TLS/SSL handshake was not successful but was shut down controlled and
+ by the specifications of the TLS/SSL protocol. Call SSL_get_error() with the
+ return value B<ret> to find out the reason.
+ 
++=item 1
++
++The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
++established.
++
+ =item E<lt>0
+ 
+ The TLS/SSL handshake was not successful because a fatal error occurred either
+diff --git a/doc/ssl/SSL_shutdown.pod b/doc/ssl/SSL_shutdown.pod
+index 89911ac..42a89b7 100644
+--- a/doc/ssl/SSL_shutdown.pod
++++ b/doc/ssl/SSL_shutdown.pod
+@@ -92,11 +92,6 @@ The following return values can occur:
+ 
+ =over 4
+ 
+-=item 1
+-
+-The shutdown was successfully completed. The "close notify" alert was sent
+-and the peer's "close notify" alert was received.
+-
+ =item 0
+ 
+ The shutdown is not yet finished. Call SSL_shutdown() for a second time,
+@@ -104,6 +99,11 @@ if a bidirectional shutdown shall be performed.
+ The output of L<SSL_get_error(3)|SSL_get_error(3)> may be misleading, as an
+ erroneous SSL_ERROR_SYSCALL may be flagged even though no error occurred.
+ 
++=item 1
++
++The shutdown was successfully completed. The "close notify" alert was sent
++and the peer's "close notify" alert was received.
++
+ =item -1
+ 
+ The shutdown was not successful because a fatal error occurred either
+diff --git a/ssl/d1_pkt.c b/ssl/d1_pkt.c
+index 3c81786..9b013e4 100644
+--- a/ssl/d1_pkt.c
++++ b/ssl/d1_pkt.c
+@@ -371,7 +371,7 @@ dtls1_process_record(SSL *s)
+ 	int enc_err;
+ 	SSL_SESSION *sess;
+ 	SSL3_RECORD *rr;
+-	unsigned int mac_size;
++	unsigned int mac_size, orig_len;
+ 	unsigned char md[EVP_MAX_MD_SIZE];
+ 
+ 	rr= &(s->s3->rrec);
+@@ -402,7 +402,6 @@ dtls1_process_record(SSL *s)
+ 
+ 	/* decrypt in place in 'rr->input' */
+ 	rr->data=rr->input;
+-	rr->orig_len=rr->length;
+ 
+ 	enc_err = s->method->ssl3_enc->enc(s,0);
+ 	/* enc_err is:
+@@ -434,15 +433,18 @@ printf("\n");
+ 		mac_size=EVP_MD_CTX_size(s->read_hash);
+ 		OPENSSL_assert(mac_size <= EVP_MAX_MD_SIZE);
+ 
++		/* kludge: *_cbc_remove_padding passes padding length in rr->type */
++		orig_len = rr->length+((unsigned int)rr->type>>8);
++
+ 		/* orig_len is the length of the record before any padding was
+ 		 * removed. This is public information, as is the MAC in use,
+ 		 * therefore we can safely process the record in a different
+ 		 * amount of time if it's too short to possibly contain a MAC.
+ 		 */
+-		if (rr->orig_len < mac_size ||
++		if (orig_len < mac_size ||
+ 		    /* CBC records must have a padding length byte too. */
+ 		    (EVP_CIPHER_CTX_mode(s->enc_read_ctx) == EVP_CIPH_CBC_MODE &&
+-		     rr->orig_len < mac_size+1))
++		     orig_len < mac_size+1))
+ 			{
+ 			al=SSL_AD_DECODE_ERROR;
+ 			SSLerr(SSL_F_DTLS1_PROCESS_RECORD,SSL_R_LENGTH_TOO_SHORT);
+@@ -457,12 +459,12 @@ printf("\n");
+ 			 * without leaking the contents of the padding bytes.
+ 			 * */
+ 			mac = mac_tmp;
+-			ssl3_cbc_copy_mac(mac_tmp, rr, mac_size);
++			ssl3_cbc_copy_mac(mac_tmp, rr, mac_size, orig_len);
+ 			rr->length -= mac_size;
+ 			}
+ 		else
+ 			{
+-			/* In this case there's no padding, so |rec->orig_len|
++			/* In this case there's no padding, so |orig_len|
+ 			 * equals |rec->length| and we checked that there's
+ 			 * enough bytes for |mac_size| above. */
+ 			rr->length -= mac_size;
+diff --git a/ssl/s3_cbc.c b/ssl/s3_cbc.c
+index dc3fd3e..61413b8 100644
+--- a/ssl/s3_cbc.c
++++ b/ssl/s3_cbc.c
+@@ -76,6 +76,13 @@
+ #define DUPLICATE_MSB_TO_ALL(x) ( (unsigned)( (int)(x) >> (sizeof(int)*8-1) ) )
+ #define DUPLICATE_MSB_TO_ALL_8(x) ((unsigned char)(DUPLICATE_MSB_TO_ALL(x)))
+ 
++/* constant_time_lt returns 0xff if a<b and 0x00 otherwise. */
++static unsigned constant_time_lt(unsigned a, unsigned b)
++	{
++	a -= b;
++	return DUPLICATE_MSB_TO_ALL(a);
++	}
++
+ /* constant_time_ge returns 0xff if a>=b and 0x00 otherwise. */
+ static unsigned constant_time_ge(unsigned a, unsigned b)
+ 	{
+@@ -84,7 +91,7 @@ static unsigned constant_time_ge(unsigned a, unsigned b)
+ 	}
+ 
+ /* constant_time_eq_8 returns 0xff if a==b and 0x00 otherwise. */
+-static unsigned char constant_time_eq_8(unsigned char a, unsigned char b)
++static unsigned char constant_time_eq_8(unsigned a, unsigned b)
+ 	{
+ 	unsigned c = a ^ b;
+ 	c--;
+@@ -116,7 +123,9 @@ int ssl3_cbc_remove_padding(const SSL* s,
+ 	good = constant_time_ge(rec->length, padding_length+overhead);
+ 	/* SSLv3 requires that the padding is minimal. */
+ 	good &= constant_time_ge(block_size, padding_length+1);
+-	rec->length -= good & (padding_length+1);
++	padding_length = good & (padding_length+1);
++	rec->length -= padding_length;
++	rec->type |= padding_length<<8;	/* kludge: pass padding length */
+ 	return (int)((good & 1) | (~good & -1));
+ }
+ 
+@@ -137,14 +146,21 @@ int tls1_cbc_remove_padding(const SSL* s,
+ 			    unsigned mac_size)
+ 	{
+ 	unsigned padding_length, good, to_check, i;
+-	const char has_explicit_iv = s->version == DTLS1_VERSION;
+-	const unsigned overhead = 1 /* padding length byte */ +
+-				  mac_size +
+-				  (has_explicit_iv ? block_size : 0);
+-
+-	/* These lengths are all public so we can test them in non-constant
+-	 * time. */
+-	if (overhead > rec->length)
++	const unsigned overhead = 1 /* padding length byte */ + mac_size;
++	/* Check if version requires explicit IV */
++	if (s->version == DTLS1_VERSION || s->version == DTLS1_BAD_VER)
++		{
++		/* These lengths are all public so we can test them in
++		 * non-constant time.
++		 */
++		if (overhead + block_size > rec->length)
++			return 0;
++		/* We can now safely skip explicit IV */
++		rec->data += block_size;
++		rec->input += block_size;
++		rec->length -= block_size;
++		}
++	else if (overhead > rec->length)
+ 		return 0;
+ 
+ 	padding_length = rec->data[rec->length-1];
+@@ -202,31 +218,13 @@ int tls1_cbc_remove_padding(const SSL* s,
+ 	good <<= sizeof(good)*8-1;
+ 	good = DUPLICATE_MSB_TO_ALL(good);
+ 
+-	rec->length -= good & (padding_length+1);
+-
+-	/* We can always safely skip the explicit IV. We check at the beginning
+-	 * of this function that the record has at least enough space for the
+-	 * IV, MAC and padding length byte. (These can be checked in
+-	 * non-constant time because it's all public information.) So, if the
+-	 * padding was invalid, then we didn't change |rec->length| and this is
+-	 * safe. If the padding was valid then we know that we have at least
+-	 * overhead+padding_length bytes of space and so this is still safe
+-	 * because overhead accounts for the explicit IV. */
+-	if (has_explicit_iv)
+-		{
+-		rec->data += block_size;
+-		rec->input += block_size;
+-		rec->length -= block_size;
+-		rec->orig_len -= block_size;
+-		}
++	padding_length = good & (padding_length+1);
++	rec->length -= padding_length;
++	rec->type |= padding_length<<8;	/* kludge: pass padding length */
+ 
+ 	return (int)((good & 1) | (~good & -1));
+ 	}
+ 
+-#if defined(_M_AMD64) || defined(__x86_64__)
+-#define CBC_MAC_ROTATE_IN_PLACE
+-#endif
+-
+ /* ssl3_cbc_copy_mac copies |md_size| bytes from the end of |rec| to |out| in
+  * constant time (independent of the concrete value of rec->length, which may
+  * vary within a 256-byte window).
+@@ -240,15 +238,18 @@ int tls1_cbc_remove_padding(const SSL* s,
+  *
+  * If CBC_MAC_ROTATE_IN_PLACE is defined then the rotation is performed with
+  * variable accesses in a 64-byte-aligned buffer. Assuming that this fits into
+- * a single cache-line, then the variable memory accesses don't actually affect
+- * the timing. This has been tested to be true on Intel amd64 chips.
++ * a single or pair of cache-lines, then the variable memory accesses don't
++ * actually affect the timing. CPUs with smaller cache-lines [if any] are
++ * not multi-core and are not considered vulnerable to cache-timing attacks.
+  */
++#define CBC_MAC_ROTATE_IN_PLACE
++
+ void ssl3_cbc_copy_mac(unsigned char* out,
+ 		       const SSL3_RECORD *rec,
+-		       unsigned md_size)
++		       unsigned md_size,unsigned orig_len)
+ 	{
+ #if defined(CBC_MAC_ROTATE_IN_PLACE)
+-	unsigned char rotated_mac_buf[EVP_MAX_MD_SIZE*2];
++	unsigned char rotated_mac_buf[64+EVP_MAX_MD_SIZE];
+ 	unsigned char *rotated_mac;
+ #else
+ 	unsigned char rotated_mac[EVP_MAX_MD_SIZE];
+@@ -264,16 +265,16 @@ void ssl3_cbc_copy_mac(unsigned char* out,
+ 	unsigned div_spoiler;
+ 	unsigned rotate_offset;
+ 
+-	OPENSSL_assert(rec->orig_len >= md_size);
++	OPENSSL_assert(orig_len >= md_size);
+ 	OPENSSL_assert(md_size <= EVP_MAX_MD_SIZE);
+ 
+ #if defined(CBC_MAC_ROTATE_IN_PLACE)
+-	rotated_mac = (unsigned char*) (((intptr_t)(rotated_mac_buf + 64)) & ~63);
++	rotated_mac = rotated_mac_buf + ((0-(size_t)rotated_mac_buf)&63);
+ #endif
+ 
+ 	/* This information is public so it's safe to branch based on it. */
+-	if (rec->orig_len > md_size + 255 + 1)
+-		scan_start = rec->orig_len - (md_size + 255 + 1);
++	if (orig_len > md_size + 255 + 1)
++		scan_start = orig_len - (md_size + 255 + 1);
+ 	/* div_spoiler contains a multiple of md_size that is used to cause the
+ 	 * modulo operation to be constant time. Without this, the time varies
+ 	 * based on the amount of padding when running on Intel chips at least.
+@@ -286,16 +287,13 @@ void ssl3_cbc_copy_mac(unsigned char* out,
+ 	rotate_offset = (div_spoiler + mac_start - scan_start) % md_size;
+ 
+ 	memset(rotated_mac, 0, md_size);
+-	for (i = scan_start; i < rec->orig_len;)
++	for (i = scan_start, j = 0; i < orig_len; i++)
+ 		{
+-		for (j = 0; j < md_size && i < rec->orig_len; i++, j++)
+-			{
+-			unsigned char mac_started = constant_time_ge(i, mac_start);
+-			unsigned char mac_ended = constant_time_ge(i, mac_end);
+-			unsigned char b = 0;
+-			b = rec->data[i];
+-			rotated_mac[j] |= b & mac_started & ~mac_ended;
+-			}
++		unsigned char mac_started = constant_time_ge(i, mac_start);
++		unsigned char mac_ended = constant_time_ge(i, mac_end);
++		unsigned char b = rec->data[i];
++		rotated_mac[j++] |= b & mac_started & ~mac_ended;
++		j &= constant_time_lt(j,md_size);
+ 		}
+ 
+ 	/* Now rotate the MAC */
+@@ -303,30 +301,43 @@ void ssl3_cbc_copy_mac(unsigned char* out,
+ 	j = 0;
+ 	for (i = 0; i < md_size; i++)
+ 		{
+-		unsigned char offset = (div_spoiler + rotate_offset + i) % md_size;
+-		out[j++] = rotated_mac[offset];
++		/* in case cache-line is 32 bytes, touch second line */
++		((volatile unsigned char *)rotated_mac)[rotate_offset^32];
++		out[j++] = rotated_mac[rotate_offset++];
++		rotate_offset &= constant_time_lt(rotate_offset,md_size);
+ 		}
+ #else
+ 	memset(out, 0, md_size);
++	rotate_offset = md_size - rotate_offset;
++	rotate_offset &= constant_time_lt(rotate_offset,md_size);
+ 	for (i = 0; i < md_size; i++)
+ 		{
+-		unsigned char offset = (div_spoiler + md_size - rotate_offset + i) % md_size;
+ 		for (j = 0; j < md_size; j++)
+-			out[j] |= rotated_mac[i] & constant_time_eq_8(j, offset);
++			out[j] |= rotated_mac[i] & constant_time_eq_8(j, rotate_offset);
++		rotate_offset++;
++		rotate_offset &= constant_time_lt(rotate_offset,md_size);
+ 		}
+ #endif
+ 	}
+ 
++/* u32toLE serialises an unsigned, 32-bit number (n) as four bytes at (p) in
++ * little-endian order. The value of p is advanced by four. */
++#define u32toLE(n, p) \
++	(*((p)++)=(unsigned char)(n), \
++	 *((p)++)=(unsigned char)(n>>8), \
++	 *((p)++)=(unsigned char)(n>>16), \
++	 *((p)++)=(unsigned char)(n>>24))
++
+ /* These functions serialize the state of a hash and thus perform the standard
+  * "final" operation without adding the padding and length that such a function
+  * typically does. */
+ static void tls1_md5_final_raw(void* ctx, unsigned char *md_out)
+ 	{
+ 	MD5_CTX *md5 = ctx;
+-	l2n(md5->A, md_out);
+-	l2n(md5->B, md_out);
+-	l2n(md5->C, md_out);
+-	l2n(md5->D, md_out);
++	u32toLE(md5->A, md_out);
++	u32toLE(md5->B, md_out);
++	u32toLE(md5->C, md_out);
++	u32toLE(md5->D, md_out);
+ 	}
+ 
+ static void tls1_sha1_final_raw(void* ctx, unsigned char *md_out)
+@@ -442,6 +453,7 @@ void ssl3_cbc_digest_record(
+ 	/* mdLengthSize is the number of bytes in the length field that terminates
+ 	* the hash. */
+ 	unsigned md_length_size = 8;
++	char length_is_big_endian = 1;
+ 
+ 	/* This is a, hopefully redundant, check that allows us to forget about
+ 	 * many possible overflows later in this function. */
+@@ -455,6 +467,7 @@ void ssl3_cbc_digest_record(
+ 			md_transform = (void(*)(void *ctx, const unsigned char *block)) MD5_Transform;
+ 			md_size = 16;
+ 			sslv3_pad_length = 48;
++			length_is_big_endian = 0;
+ 			break;
+ 		case NID_sha1:
+ 			SHA1_Init((SHA_CTX*)md_state.c);
+@@ -595,11 +608,22 @@ void ssl3_cbc_digest_record(
+ 		md_transform(md_state.c, hmac_pad);
+ 		}
+ 
+-	memset(length_bytes,0,md_length_size-4);
+-	length_bytes[md_length_size-4] = (unsigned char)(bits>>24);
+-	length_bytes[md_length_size-3] = (unsigned char)(bits>>16);
+-	length_bytes[md_length_size-2] = (unsigned char)(bits>>8);
+-	length_bytes[md_length_size-1] = (unsigned char)bits;
++	if (length_is_big_endian)
++		{
++		memset(length_bytes,0,md_length_size-4);
++		length_bytes[md_length_size-4] = (unsigned char)(bits>>24);
++		length_bytes[md_length_size-3] = (unsigned char)(bits>>16);
++		length_bytes[md_length_size-2] = (unsigned char)(bits>>8);
++		length_bytes[md_length_size-1] = (unsigned char)bits;
++		}
++	else
++		{
++		memset(length_bytes,0,md_length_size);
++		length_bytes[md_length_size-5] = (unsigned char)(bits>>24);
++		length_bytes[md_length_size-6] = (unsigned char)(bits>>16);
++		length_bytes[md_length_size-7] = (unsigned char)(bits>>8);
++		length_bytes[md_length_size-8] = (unsigned char)bits;
++		}
+ 
+ 	if (k > 0)
+ 		{
+diff --git a/ssl/s3_clnt.c b/ssl/s3_clnt.c
+index fc53161..f1f9c21 100644
+--- a/ssl/s3_clnt.c
++++ b/ssl/s3_clnt.c
+@@ -888,7 +888,10 @@ int ssl3_get_server_hello(SSL *s)
+ 		}
+ 	s->s3->tmp.new_cipher=c;
+ 	if (!ssl3_digest_cached_records(s))
++		{
++		al = SSL_AD_INTERNAL_ERROR;
+ 		goto f_err;
++		}
+ 
+ 	/* lets get the compression algorithm */
+ 	/* COMPRESSION */
+@@ -968,7 +971,9 @@ int ssl3_get_server_hello(SSL *s)
+ 	return(1);
+ f_err:
+ 	ssl3_send_alert(s,SSL3_AL_FATAL,al);
++#ifndef OPENSSL_NO_TLSEXT
+ err:
++#endif
+ 	return(-1);
+ 	}
+ 
+diff --git a/ssl/s3_enc.c b/ssl/s3_enc.c
+index 76d87b5..6bc0812 100644
+--- a/ssl/s3_enc.c
++++ b/ssl/s3_enc.c
+@@ -697,7 +697,7 @@ int n_ssl3_mac(SSL *ssl, unsigned char *md, int send)
+ 	EVP_MD_CTX md_ctx;
+ 	const EVP_MD_CTX *hash;
+ 	unsigned char *p,rec_char;
+-	size_t md_size;
++	size_t md_size, orig_len;
+ 	int npad;
+ 	int t;
+ 
+@@ -722,6 +722,10 @@ int n_ssl3_mac(SSL *ssl, unsigned char *md, int send)
+ 	md_size=t;
+ 	npad=(48/md_size)*md_size;
+ 
++	/* kludge: ssl3_cbc_remove_padding passes padding length in rec->type */
++	orig_len = rec->length+md_size+((unsigned int)rec->type>>8);
++	rec->type &= 0xff;
++
+ 	if (!send &&
+ 	    EVP_CIPHER_CTX_mode(ssl->enc_read_ctx) == EVP_CIPH_CBC_MODE &&
+ 	    ssl3_cbc_record_digest_supported(hash))
+@@ -753,7 +757,7 @@ int n_ssl3_mac(SSL *ssl, unsigned char *md, int send)
+ 			hash,
+ 			md, &md_size,
+ 			header, rec->input,
+-			rec->length + md_size, rec->orig_len,
++			rec->length + md_size, orig_len,
+ 			mac_sec, md_size,
+ 			1 /* is SSLv3 */);
+ 		}
+diff --git a/ssl/s3_pkt.c b/ssl/s3_pkt.c
+index 7d8fc53..a41279e 100644
+--- a/ssl/s3_pkt.c
++++ b/ssl/s3_pkt.c
+@@ -289,7 +289,7 @@ static int ssl3_get_record(SSL *s)
+ 	unsigned char *p;
+ 	unsigned char md[EVP_MAX_MD_SIZE];
+ 	short version;
+-	unsigned mac_size;
++	unsigned mac_size, orig_len;
+ 	size_t extra;
+ 
+ 	rr= &(s->s3->rrec);
+@@ -397,7 +397,6 @@ fprintf(stderr, "Record type=%d, Length=%d\n", rr->type, rr->length);
+ 
+ 	/* decrypt in place in 'rr->input' */
+ 	rr->data=rr->input;
+-	rr->orig_len=rr->length;
+ 
+ 	enc_err = s->method->ssl3_enc->enc(s,0);
+ 	/* enc_err is:
+@@ -428,15 +427,18 @@ printf("\n");
+ 		mac_size=EVP_MD_CTX_size(s->read_hash);
+ 		OPENSSL_assert(mac_size <= EVP_MAX_MD_SIZE);
+ 
++		/* kludge: *_cbc_remove_padding passes padding length in rr->type */
++		orig_len = rr->length+((unsigned int)rr->type>>8);
++
+ 		/* orig_len is the length of the record before any padding was
+ 		 * removed. This is public information, as is the MAC in use,
+ 		 * therefore we can safely process the record in a different
+ 		 * amount of time if it's too short to possibly contain a MAC.
+ 		 */
+-		if (rr->orig_len < mac_size ||
++		if (orig_len < mac_size ||
+ 		    /* CBC records must have a padding length byte too. */
+ 		    (EVP_CIPHER_CTX_mode(s->enc_read_ctx) == EVP_CIPH_CBC_MODE &&
+-		     rr->orig_len < mac_size+1))
++		     orig_len < mac_size+1))
+ 			{
+ 			al=SSL_AD_DECODE_ERROR;
+ 			SSLerr(SSL_F_SSL3_GET_RECORD,SSL_R_LENGTH_TOO_SHORT);
+@@ -451,12 +453,12 @@ printf("\n");
+ 			 * without leaking the contents of the padding bytes.
+ 			 * */
+ 			mac = mac_tmp;
+-			ssl3_cbc_copy_mac(mac_tmp, rr, mac_size);
++			ssl3_cbc_copy_mac(mac_tmp, rr, mac_size, orig_len);
+ 			rr->length -= mac_size;
+ 			}
+ 		else
+ 			{
+-			/* In this case there's no padding, so |rec->orig_len|
++			/* In this case there's no padding, so |orig_len|
+ 			 * equals |rec->length| and we checked that there's
+ 			 * enough bytes for |mac_size| above. */
+ 			rr->length -= mac_size;
+diff --git a/ssl/s3_srvr.c b/ssl/s3_srvr.c
+index b4a6a37..14aa451 100644
+--- a/ssl/s3_srvr.c
++++ b/ssl/s3_srvr.c
+@@ -1269,7 +1269,10 @@ int ssl3_get_client_hello(SSL *s)
+ 		}
+ 
+ 	if (!ssl3_digest_cached_records(s))
++		{
++		al = SSL_AD_INTERNAL_ERROR;
+ 		goto f_err;
++		}
+ 	
+ 	/* we now have the following setup. 
+ 	 * client_random
+@@ -1282,6 +1285,7 @@ int ssl3_get_client_hello(SSL *s)
+ 	 * s->tmp.new_cipher	- the new cipher to use.
+ 	 */
+ 
++#ifndef OPENSSL_NO_TLSEXT
+ 	/* Handles TLS extensions that we couldn't check earlier */
+ 	if (s->version >= SSL3_VERSION)
+ 		{
+@@ -1291,6 +1295,7 @@ int ssl3_get_client_hello(SSL *s)
+ 			goto err;
+ 			}
+ 		}
++#endif
+ 
+ 	if (ret < 0) ret=1;
+ 	if (0)
+diff --git a/ssl/ssl3.h b/ssl/ssl3.h
+index d3bd768..9c2c412 100644
+--- a/ssl/ssl3.h
++++ b/ssl/ssl3.h
+@@ -349,10 +349,6 @@ typedef struct ssl3_record_st
+ /*r */	unsigned char *comp;    /* only used with decompression - malloc()ed */
+ /*r */  unsigned long epoch;    /* epoch number, needed by DTLS1 */
+ /*r */  unsigned char seq_num[8]; /* sequence number, needed by DTLS1 */
+-/*rw*/	unsigned int orig_len;  /* How many bytes were available before padding
+-				   was removed? This is used to implement the
+-				   MAC check in constant time for CBC records.
+-				 */
+ 	} SSL3_RECORD;
+ 
+ typedef struct ssl3_buffer_st
+diff --git a/ssl/ssl_lib.c b/ssl/ssl_lib.c
+index 25573e4..b3c21ea 100644
+--- a/ssl/ssl_lib.c
++++ b/ssl/ssl_lib.c
+@@ -2601,9 +2601,7 @@ void ssl_clear_cipher_ctx(SSL *s)
+ /* Fix this function so that it takes an optional type parameter */
+ X509 *SSL_get_certificate(const SSL *s)
+ 	{
+-	if (s->server)
+-		return(ssl_get_server_send_cert(s));
+-	else if (s->cert != NULL)
++	if (s->cert != NULL)
+ 		return(s->cert->key->x509);
+ 	else
+ 		return(NULL);
+diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h
+index 6a4f62a..b0dab18 100644
+--- a/ssl/ssl_locl.h
++++ b/ssl/ssl_locl.h
+@@ -1091,7 +1091,7 @@ int ssl_parse_clienthello_renegotiate_ext(SSL *s, unsigned char *d, int len,
+ /* s3_cbc.c */
+ void ssl3_cbc_copy_mac(unsigned char* out,
+ 		       const SSL3_RECORD *rec,
+-		       unsigned md_size);
++		       unsigned md_size,unsigned orig_len);
+ int ssl3_cbc_remove_padding(const SSL* s,
+ 			    SSL3_RECORD *rec,
+ 			    unsigned block_size,
+diff --git a/ssl/ssltest.c b/ssl/ssltest.c
+index 0bb4fa4..eaad524 100644
+--- a/ssl/ssltest.c
++++ b/ssl/ssltest.c
+@@ -782,7 +782,13 @@ bad:
+ 		meth=SSLv23_method();
+ #else
+ #ifdef OPENSSL_NO_SSL2
+-	meth=SSLv3_method();
++	if (tls1)
++		meth=TLSv1_method();
++	else
++	if (ssl3)
++		meth=SSLv3_method();
++	else
++		meth=SSLv23_method();
+ #else
+ 	meth=SSLv2_method();
+ #endif
+diff --git a/ssl/t1_enc.c b/ssl/t1_enc.c
+index c38dae6..d67f6f1 100644
+--- a/ssl/t1_enc.c
++++ b/ssl/t1_enc.c
+@@ -851,7 +851,7 @@ int tls1_mac(SSL *ssl, unsigned char *md, int send)
+ 	SSL3_RECORD *rec;
+ 	unsigned char *seq;
+ 	EVP_MD_CTX *hash;
+-	size_t md_size;
++	size_t md_size, orig_len;
+ 	int i;
+ 	EVP_MD_CTX hmac, *mac_ctx;
+ 	unsigned char header[13];
+@@ -898,6 +898,10 @@ int tls1_mac(SSL *ssl, unsigned char *md, int send)
+ 	else
+ 		memcpy(header, seq, 8);
+ 
++	/* kludge: tls1_cbc_remove_padding passes padding length in rec->type */
++	orig_len = rec->length+md_size+((unsigned int)rec->type>>8);
++	rec->type &= 0xff;
++
+ 	header[8]=rec->type;
+ 	header[9]=(unsigned char)(ssl->version>>8);
+ 	header[10]=(unsigned char)(ssl->version);
+@@ -916,7 +920,7 @@ int tls1_mac(SSL *ssl, unsigned char *md, int send)
+ 			mac_ctx,
+ 			md, &md_size,
+ 			header, rec->input,
+-			rec->length + md_size, rec->orig_len,
++			rec->length + md_size, orig_len,
+ 			ssl->s3->read_mac_secret,
+ 			ssl->s3->read_mac_secret_size,
+ 			0 /* not SSLv3 */);
+diff --git a/test/cms-test.pl b/test/cms-test.pl
+index c938bcf..dfef799 100644
+--- a/test/cms-test.pl
++++ b/test/cms-test.pl
+@@ -415,8 +415,10 @@ sub run_smime_tests {
+ }
+ 
+ sub cmp_files {
++    use FileHandle;
+     my ( $f1, $f2 ) = @_;
+-    my ( $fp1, $fp2 );
++    my $fp1 = FileHandle->new();
++    my $fp2 = FileHandle->new();
+ 
+     my ( $rd1, $rd2 );
+ 
+diff --git a/test/testssl b/test/testssl
+index b55364a..04341e9 100644
+--- a/test/testssl
++++ b/test/testssl
+@@ -119,6 +119,23 @@ $ssltest -bio_pair -server_auth -client_auth $CA $extra || exit 1
+ echo test sslv2/sslv3 with both client and server authentication via BIO pair and app verify
+ $ssltest -bio_pair -server_auth -client_auth -app_verify $CA $extra || exit 1
+ 
++echo "Testing ciphersuites"
++for protocol in SSLv3; do
++  echo "Testing ciphersuites for $protocol"
++  for cipher in `../util/shlib_wrap.sh ../apps/openssl ciphers "RSA+$protocol" | tr ':' ' '`; do
++    echo "Testing $cipher"
++    prot=""
++    if [ $protocol == "SSLv3" ] ; then
++      prot="-ssl3"
++    fi
++    $ssltest -cipher $cipher $prot
++    if [ $? -ne 0 ] ; then
++	  echo "Failed $cipher"
++	  exit 1
++    fi
++  done
++done
++
+ #############################################################################
+ 
+ if ../util/shlib_wrap.sh ../apps/openssl no-dh; then
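The bulk of the backports patch above (ssl/s3_cbc.c, ssl/s3_pkt.c, ssl/d1_pkt.c, ssl/s3_enc.c,
ssl/t1_enc.c) is the mitigation for the Lucky Thirteen timing attack (CVE-2013-0169), one of the
CVEs this update refers to: CBC padding and MAC checks are performed without data-dependent
branches, selecting values with bit masks derived from comparisons. A minimal standalone sketch
of that mask idiom follows; it is not part of the patch, the helper names merely mirror the ones
added in ssl/s3_cbc.c, and like the original it relies on arithmetic right shift of a negative int.

    #include <stdio.h>

    /* Turn the most significant bit of x into an all-ones or all-zero mask. */
    #define DUPLICATE_MSB_TO_ALL(x) ((unsigned)((int)(x) >> (sizeof(int)*8 - 1)))

    /* Returns 0xffffffff if a < b and 0 otherwise, without branching. */
    static unsigned constant_time_lt(unsigned a, unsigned b)
    	{
    	a -= b;				/* MSB becomes set exactly when a < b */
    	return DUPLICATE_MSB_TO_ALL(a);
    	}

    /* Returns 0xffffffff if a >= b and 0 otherwise, without branching. */
    static unsigned constant_time_ge(unsigned a, unsigned b)
    	{
    	a -= b;				/* MSB stays clear exactly when a >= b */
    	return DUPLICATE_MSB_TO_ALL(~a);
    	}

    int main(void)
    	{
    	unsigned good = constant_time_ge(20, 13);	/* all ones */
    	unsigned bad  = constant_time_ge(5, 13);	/* all zero  */
    	/* Select a value with AND instead of an if: prints 7 0 1. */
    	printf("%u %u %u\n", good & 7u, bad & 7u, constant_time_lt(5, 13) & 1u);
    	return 0;
    	}

This is how the patched tls1_cbc_remove_padding and ssl3_cbc_copy_mac decide how many padding
bytes to strip and which bytes belong to the MAC while taking the same amount of time for valid
and invalid padding.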
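A second detail worth calling out: the hunk against ssl/ssl3.h removes the orig_len member from
SSL3_RECORD rather than keeping it, presumably to avoid changing the public struct layout, so the
record's length before padding removal has to reach the MAC code some other way. The backport
smuggles it through the high bits of rr->type, since the real record type only occupies the low
byte. A hedged illustration of that round trip, using a stand-in struct rather than the real
SSL3_RECORD:

    #include <assert.h>

    /* Stand-in for SSL3_RECORD: only the two fields the trick touches. */
    struct rec_st
    	{
    	unsigned int type;	/* low byte: record type; high bits: borrowed */
    	unsigned int length;	/* payload length, shrinks as padding is stripped */
    	};

    /* Done by ssl3/tls1_cbc_remove_padding: strip padding, remember how much. */
    static void stash_padding(struct rec_st *rec, unsigned int padding_length)
    	{
    	rec->length -= padding_length;
    	rec->type |= padding_length << 8;	/* kludge: pass padding length */
    	}

    /* Done before the MAC check: recover the pre-padding length, restore the type. */
    static unsigned int recover_orig_len(struct rec_st *rec)
    	{
    	unsigned int orig_len = rec->length + (rec->type >> 8);
    	rec->type &= 0xff;			/* back to the plain record type */
    	return orig_len;
    	}

    int main(void)
    	{
    	struct rec_st rec = { 23 /* application_data */, 64 };
    	stash_padding(&rec, 16);		/* 16 bytes of CBC padding removed */
    	assert(rec.length == 48);
    	assert(recover_orig_len(&rec) == 64);
    	assert(rec.type == 23);
    	return 0;
    	}

In the actual patch the MAC routines (n_ssl3_mac, tls1_mac) additionally add md_size back when
computing orig_len, because rr->length has already had the MAC subtracted by the time they run.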
diff --git a/openssl-1.0.0f-fips.patch b/openssl-1.0.0k-fips.patch
similarity index 91%
rename from openssl-1.0.0f-fips.patch
rename to openssl-1.0.0k-fips.patch
index 435d72f..8b53d11 100644
--- a/openssl-1.0.0f-fips.patch
+++ b/openssl-1.0.0k-fips.patch
@@ -1,7 +1,7 @@
-diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
---- openssl-1.0.0f/Configure.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/Configure	2012-01-05 13:22:30.000000000 +0100
-@@ -663,6 +663,7 @@ my $cmll_enc="camellia.o cmll_misc.o cml
+diff -up openssl-1.0.0k/Configure.fips openssl-1.0.0k/Configure
+--- openssl-1.0.0k/Configure.fips	2013-02-19 20:12:54.536663757 +0100
++++ openssl-1.0.0k/Configure	2013-02-19 20:12:54.574664476 +0100
+@@ -664,6 +664,7 @@ my $cmll_enc="camellia.o cmll_misc.o cml
  my $processor="";
  my $default_ranlib;
  my $perl;
@@ -9,7 +9,7 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
  
  
  # All of the following is disabled by default (RC5 was enabled before 0.9.8):
-@@ -809,6 +810,10 @@ PROCESS_ARGS:
+@@ -810,6 +811,10 @@ PROCESS_ARGS:
  			}
  		elsif (/^386$/)
  			{ $processor=386; }
@@ -20,7 +20,7 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
  		elsif (/^rsaref$/)
  			{
  			# No RSAref support any more since it's not needed.
-@@ -1383,6 +1388,11 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no
+@@ -1386,6 +1391,11 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no
  
  $cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
  
@@ -32,7 +32,7 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
  $cpuid_obj="mem_clr.o"	unless ($cpuid_obj =~ /\.o$/);
  $des_obj=$des_enc	unless ($des_obj =~ /\.o$/);
  $bf_obj=$bf_enc		unless ($bf_obj =~ /\.o$/);
-@@ -1550,6 +1560,10 @@ while (<IN>)
+@@ -1553,6 +1563,10 @@ while (<IN>)
  	s/^LIBKRB5=.*/LIBKRB5=$withargs{"krb5-lib"}/;
  	s/^LIBZLIB=.*/LIBZLIB=$withargs{"zlib-lib"}/;
  	s/^ZLIB_INCLUDE=.*/ZLIB_INCLUDE=$withargs{"zlib-include"}/;
@@ -43,9 +43,9 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
  	s/^SHLIB_TARGET=.*/SHLIB_TARGET=$shared_target/;
  	s/^SHLIB_MARK=.*/SHLIB_MARK=$shared_mark/;
  	s/^SHARED_LIBS=.*/SHARED_LIBS=\$(SHARED_CRYPTO) \$(SHARED_SSL)/ if (!$no_shared);
-diff -up openssl-1.0.0f/crypto/bf/bf_skey.c.fips openssl-1.0.0f/crypto/bf/bf_skey.c
---- openssl-1.0.0f/crypto/bf/bf_skey.c.fips	2008-11-12 04:57:52.000000000 +0100
-+++ openssl-1.0.0f/crypto/bf/bf_skey.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bf/bf_skey.c.fips openssl-1.0.0k/crypto/bf/bf_skey.c
+--- openssl-1.0.0k/crypto/bf/bf_skey.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/bf/bf_skey.c	2013-02-19 20:12:54.574664476 +0100
 @@ -59,10 +59,15 @@
  #include <stdio.h>
  #include <string.h>
@@ -63,9 +63,9 @@ diff -up openssl-1.0.0f/crypto/bf/bf_skey.c.fips openssl-1.0.0f/crypto/bf/bf_ske
  	{
  	int i;
  	BF_LONG *p,ri,in[2];
-diff -up openssl-1.0.0f/crypto/bf/blowfish.h.fips openssl-1.0.0f/crypto/bf/blowfish.h
---- openssl-1.0.0f/crypto/bf/blowfish.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/bf/blowfish.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bf/blowfish.h.fips openssl-1.0.0k/crypto/bf/blowfish.h
+--- openssl-1.0.0k/crypto/bf/blowfish.h.fips	2013-02-19 20:12:53.998653547 +0100
++++ openssl-1.0.0k/crypto/bf/blowfish.h	2013-02-19 20:12:54.575664496 +0100
 @@ -104,7 +104,9 @@ typedef struct bf_key_st
  	BF_LONG S[4*256];
  	} BF_KEY;
@@ -77,9 +77,9 @@ diff -up openssl-1.0.0f/crypto/bf/blowfish.h.fips openssl-1.0.0f/crypto/bf/blowf
  void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
  
  void BF_encrypt(BF_LONG *data,const BF_KEY *key);
-diff -up openssl-1.0.0f/crypto/bn/bn.h.fips openssl-1.0.0f/crypto/bn/bn.h
---- openssl-1.0.0f/crypto/bn/bn.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/bn/bn.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bn/bn.h.fips openssl-1.0.0k/crypto/bn/bn.h
+--- openssl-1.0.0k/crypto/bn/bn.h.fips	2013-02-19 20:12:54.135656147 +0100
++++ openssl-1.0.0k/crypto/bn/bn.h	2013-02-19 20:12:54.575664496 +0100
 @@ -558,6 +558,17 @@ int	BN_is_prime_ex(const BIGNUM *p,int n
  int	BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
  		int do_trial_division, BN_GENCB *cb);
@@ -98,9 +98,9 @@ diff -up openssl-1.0.0f/crypto/bn/bn.h.fips openssl-1.0.0f/crypto/bn/bn.h
  BN_MONT_CTX *BN_MONT_CTX_new(void );
  void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
  int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
-diff -up openssl-1.0.0f/crypto/bn/bn_x931p.c.fips openssl-1.0.0f/crypto/bn/bn_x931p.c
---- openssl-1.0.0f/crypto/bn/bn_x931p.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/bn/bn_x931p.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bn/bn_x931p.c.fips openssl-1.0.0k/crypto/bn/bn_x931p.c
+--- openssl-1.0.0k/crypto/bn/bn_x931p.c.fips	2013-02-19 20:12:54.575664496 +0100
++++ openssl-1.0.0k/crypto/bn/bn_x931p.c	2013-02-19 20:12:54.576664516 +0100
 @@ -0,0 +1,272 @@
 +/* bn_x931p.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -374,9 +374,9 @@ diff -up openssl-1.0.0f/crypto/bn/bn_x931p.c.fips openssl-1.0.0f/crypto/bn/bn_x9
 +
 +	}
 +
-diff -up openssl-1.0.0f/crypto/bn/Makefile.fips openssl-1.0.0f/crypto/bn/Makefile
---- openssl-1.0.0f/crypto/bn/Makefile.fips	2008-11-12 09:19:02.000000000 +0100
-+++ openssl-1.0.0f/crypto/bn/Makefile	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bn/Makefile.fips openssl-1.0.0k/crypto/bn/Makefile
+--- openssl-1.0.0k/crypto/bn/Makefile.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/bn/Makefile	2013-02-19 20:12:54.576664516 +0100
 @@ -26,13 +26,13 @@ LIBSRC=	bn_add.c bn_div.c bn_exp.c bn_li
  	bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
  	bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c bn_asm.c \
@@ -393,9 +393,9 @@ diff -up openssl-1.0.0f/crypto/bn/Makefile.fips openssl-1.0.0f/crypto/bn/Makefil
  
  SRC= $(LIBSRC)
  
-diff -up openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl.fips openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl
---- openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl.fips	2009-04-06 16:25:02.000000000 +0200
-+++ openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl.fips openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl
+--- openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl	2013-02-19 20:12:54.576664516 +0100
 @@ -722,12 +722,15 @@ my $bias=int(@T[0])?shift(@T):0;
  }
  &function_end("Camellia_Ekeygen");
@@ -422,10 +422,10 @@ diff -up openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl.fips openssl-1.0.0f/cryp
  }
  
  @SBOX=(
-diff -up openssl-1.0.0f/crypto/camellia/camellia.h.fips openssl-1.0.0f/crypto/camellia/camellia.h
---- openssl-1.0.0f/crypto/camellia/camellia.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/camellia.h	2012-01-05 13:22:30.000000000 +0100
-@@ -88,6 +88,11 @@ struct camellia_key_st 
+diff -up openssl-1.0.0k/crypto/camellia/camellia.h.fips openssl-1.0.0k/crypto/camellia/camellia.h
+--- openssl-1.0.0k/crypto/camellia/camellia.h.fips	2013-02-19 20:12:53.926652181 +0100
++++ openssl-1.0.0k/crypto/camellia/camellia.h	2013-02-19 20:12:54.577664536 +0100
+@@ -88,6 +88,11 @@ struct camellia_key_st
  	};
  typedef struct camellia_key_st CAMELLIA_KEY;
  
@@ -437,9 +437,9 @@ diff -up openssl-1.0.0f/crypto/camellia/camellia.h.fips openssl-1.0.0f/crypto/ca
  int Camellia_set_key(const unsigned char *userKey, const int bits,
  	CAMELLIA_KEY *key);
  
-diff -up openssl-1.0.0f/crypto/camellia/cmll_fblk.c.fips openssl-1.0.0f/crypto/camellia/cmll_fblk.c
---- openssl-1.0.0f/crypto/camellia/cmll_fblk.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/cmll_fblk.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/cmll_fblk.c.fips openssl-1.0.0k/crypto/camellia/cmll_fblk.c
+--- openssl-1.0.0k/crypto/camellia/cmll_fblk.c.fips	2013-02-19 20:12:54.577664536 +0100
++++ openssl-1.0.0k/crypto/camellia/cmll_fblk.c	2013-02-19 20:12:54.577664536 +0100
 @@ -0,0 +1,68 @@
 +/* crypto/camellia/camellia_misc.c -*- mode:C; c-file-style: "eay" -*- */
 +/* ====================================================================
@@ -509,9 +509,9 @@ diff -up openssl-1.0.0f/crypto/camellia/cmll_fblk.c.fips openssl-1.0.0f/crypto/c
 +	return private_Camellia_set_key(userKey, bits, key);
 +	}
 +#endif
-diff -up openssl-1.0.0f/crypto/camellia/cmll_misc.c.fips openssl-1.0.0f/crypto/camellia/cmll_misc.c
---- openssl-1.0.0f/crypto/camellia/cmll_misc.c.fips	2008-10-28 13:13:52.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/cmll_misc.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/cmll_misc.c.fips openssl-1.0.0k/crypto/camellia/cmll_misc.c
+--- openssl-1.0.0k/crypto/camellia/cmll_misc.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/camellia/cmll_misc.c	2013-02-19 20:12:54.577664536 +0100
 @@ -52,11 +52,20 @@
  #include <openssl/opensslv.h>
  #include <openssl/camellia.h>
@@ -533,9 +533,9 @@ diff -up openssl-1.0.0f/crypto/camellia/cmll_misc.c.fips openssl-1.0.0f/crypto/c
  	{
  	if(!userKey || !key)
  		return -1;
-diff -up openssl-1.0.0f/crypto/camellia/Makefile.fips openssl-1.0.0f/crypto/camellia/Makefile
---- openssl-1.0.0f/crypto/camellia/Makefile.fips	2008-12-23 12:33:00.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/Makefile	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/Makefile.fips openssl-1.0.0k/crypto/camellia/Makefile
+--- openssl-1.0.0k/crypto/camellia/Makefile.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/camellia/Makefile	2013-02-19 20:12:54.578664555 +0100
 @@ -23,9 +23,9 @@ APPS=
  
  LIB=$(TOP)/libcrypto.a
@@ -548,9 +548,9 @@ diff -up openssl-1.0.0f/crypto/camellia/Makefile.fips openssl-1.0.0f/crypto/came
  
  SRC= $(LIBSRC)
  
-diff -up openssl-1.0.0f/crypto/cast/cast.h.fips openssl-1.0.0f/crypto/cast/cast.h
---- openssl-1.0.0f/crypto/cast/cast.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/cast/cast.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/cast/cast.h.fips openssl-1.0.0k/crypto/cast/cast.h
+--- openssl-1.0.0k/crypto/cast/cast.h.fips	2013-02-19 20:12:54.363660475 +0100
++++ openssl-1.0.0k/crypto/cast/cast.h	2013-02-19 20:12:54.578664555 +0100
 @@ -83,7 +83,9 @@ typedef struct cast_key_st
  	int short_key;	/* Use reduced rounds for short key */
  	} CAST_KEY;
@@ -562,9 +562,9 @@ diff -up openssl-1.0.0f/crypto/cast/cast.h.fips openssl-1.0.0f/crypto/cast/cast.
  void CAST_set_key(CAST_KEY *key, int len, const unsigned char *data);
  void CAST_ecb_encrypt(const unsigned char *in, unsigned char *out, const CAST_KEY *key,
  		      int enc);
-diff -up openssl-1.0.0f/crypto/cast/c_skey.c.fips openssl-1.0.0f/crypto/cast/c_skey.c
---- openssl-1.0.0f/crypto/cast/c_skey.c.fips	2000-06-03 16:13:35.000000000 +0200
-+++ openssl-1.0.0f/crypto/cast/c_skey.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/cast/c_skey.c.fips openssl-1.0.0k/crypto/cast/c_skey.c
+--- openssl-1.0.0k/crypto/cast/c_skey.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/cast/c_skey.c	2013-02-19 20:12:54.578664555 +0100
 @@ -57,6 +57,11 @@
   */
  
@@ -586,12 +586,12 @@ diff -up openssl-1.0.0f/crypto/cast/c_skey.c.fips openssl-1.0.0f/crypto/cast/c_s
  	{
  	CAST_LONG x[16];
  	CAST_LONG z[16];
-diff -up openssl-1.0.0f/crypto/crypto.h.fips openssl-1.0.0f/crypto/crypto.h
---- openssl-1.0.0f/crypto/crypto.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/crypto.h	2012-01-05 13:22:30.000000000 +0100
-@@ -547,12 +547,70 @@ unsigned long *OPENSSL_ia32cap_loc(void)
- #define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
- int OPENSSL_isservice(void);
+diff -up openssl-1.0.0k/crypto/crypto.h.fips openssl-1.0.0k/crypto/crypto.h
+--- openssl-1.0.0k/crypto/crypto.h.fips	2013-02-19 20:12:54.000000000 +0100
++++ openssl-1.0.0k/crypto/crypto.h	2013-02-19 20:14:08.209061781 +0100
+@@ -554,12 +554,70 @@ int OPENSSL_isservice(void);
+  * non-zero. */
+ int CRYPTO_memcmp(const void *a, const void *b, size_t len);
  
 +
 +#ifdef OPENSSL_FIPS
@@ -660,9 +660,9 @@ diff -up openssl-1.0.0f/crypto/crypto.h.fips openssl-1.0.0f/crypto/crypto.h
  /* Error codes for the CRYPTO functions. */
  
  /* Function codes. */
-diff -up openssl-1.0.0f/crypto/dh/dh_err.c.fips openssl-1.0.0f/crypto/dh/dh_err.c
---- openssl-1.0.0f/crypto/dh/dh_err.c.fips	2006-11-21 22:29:37.000000000 +0100
-+++ openssl-1.0.0f/crypto/dh/dh_err.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh_err.c.fips openssl-1.0.0k/crypto/dh/dh_err.c
+--- openssl-1.0.0k/crypto/dh/dh_err.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dh/dh_err.c	2013-02-19 20:12:54.579664573 +0100
 @@ -73,6 +73,8 @@ static ERR_STRING_DATA DH_str_functs[]=
  {ERR_FUNC(DH_F_COMPUTE_KEY),	"COMPUTE_KEY"},
  {ERR_FUNC(DH_F_DHPARAMS_PRINT_FP),	"DHparams_print_fp"},
@@ -680,9 +680,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh_err.c.fips openssl-1.0.0f/crypto/dh/dh_err.
  {ERR_REASON(DH_R_KEYS_NOT_SET)           ,"keys not set"},
  {ERR_REASON(DH_R_MODULUS_TOO_LARGE)      ,"modulus too large"},
  {ERR_REASON(DH_R_NO_PARAMETERS_SET)      ,"no parameters set"},
-diff -up openssl-1.0.0f/crypto/dh/dh_gen.c.fips openssl-1.0.0f/crypto/dh/dh_gen.c
---- openssl-1.0.0f/crypto/dh/dh_gen.c.fips	2005-04-26 20:53:15.000000000 +0200
-+++ openssl-1.0.0f/crypto/dh/dh_gen.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh_gen.c.fips openssl-1.0.0k/crypto/dh/dh_gen.c
+--- openssl-1.0.0k/crypto/dh/dh_gen.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dh/dh_gen.c	2013-02-19 20:12:54.579664573 +0100
 @@ -65,6 +65,10 @@
  #include "cryptlib.h"
  #include <openssl/bn.h>
@@ -715,9 +715,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh_gen.c.fips openssl-1.0.0f/crypto/dh/dh_gen.
  	ctx=BN_CTX_new();
  	if (ctx == NULL) goto err;
  	BN_CTX_start(ctx);
-diff -up openssl-1.0.0f/crypto/dh/dh.h.fips openssl-1.0.0f/crypto/dh/dh.h
---- openssl-1.0.0f/crypto/dh/dh.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/dh/dh.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh.h.fips openssl-1.0.0k/crypto/dh/dh.h
+--- openssl-1.0.0k/crypto/dh/dh.h.fips	2013-02-19 20:12:54.259658499 +0100
++++ openssl-1.0.0k/crypto/dh/dh.h	2013-02-19 20:12:54.580664592 +0100
 @@ -77,6 +77,8 @@
  # define OPENSSL_DH_MAX_MODULUS_BITS	10000
  #endif
@@ -744,9 +744,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh.h.fips openssl-1.0.0f/crypto/dh/dh.h
  
  #ifdef  __cplusplus
  }
-diff -up openssl-1.0.0f/crypto/dh/dh_key.c.fips openssl-1.0.0f/crypto/dh/dh_key.c
---- openssl-1.0.0f/crypto/dh/dh_key.c.fips	2007-03-28 02:15:23.000000000 +0200
-+++ openssl-1.0.0f/crypto/dh/dh_key.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh_key.c.fips openssl-1.0.0k/crypto/dh/dh_key.c
+--- openssl-1.0.0k/crypto/dh/dh_key.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dh/dh_key.c	2013-02-19 20:12:54.580664592 +0100
 @@ -61,6 +61,9 @@
  #include <openssl/bn.h>
  #include <openssl/rand.h>
@@ -796,9 +796,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh_key.c.fips openssl-1.0.0f/crypto/dh/dh_key.
  	dh->flags |= DH_FLAG_CACHE_MONT_P;
  	return(1);
  	}
-diff -up openssl-1.0.0f/crypto/dsa/dsa_gen.c.fips openssl-1.0.0f/crypto/dsa/dsa_gen.c
---- openssl-1.0.0f/crypto/dsa/dsa_gen.c.fips	2010-06-15 19:25:07.000000000 +0200
-+++ openssl-1.0.0f/crypto/dsa/dsa_gen.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa_gen.c.fips openssl-1.0.0k/crypto/dsa/dsa_gen.c
+--- openssl-1.0.0k/crypto/dsa/dsa_gen.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa_gen.c	2013-02-19 20:12:54.580664592 +0100
 @@ -77,8 +77,12 @@
  #include "cryptlib.h"
  #include <openssl/evp.h>
@@ -834,9 +834,9 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_gen.c.fips openssl-1.0.0f/crypto/dsa/dsa_
  	if (qsize != SHA_DIGEST_LENGTH && qsize != SHA224_DIGEST_LENGTH &&
  	    qsize != SHA256_DIGEST_LENGTH)
  		/* invalid q size */
-diff -up openssl-1.0.0f/crypto/dsa/dsa.h.fips openssl-1.0.0f/crypto/dsa/dsa.h
---- openssl-1.0.0f/crypto/dsa/dsa.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/dsa/dsa.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa.h.fips openssl-1.0.0k/crypto/dsa/dsa.h
+--- openssl-1.0.0k/crypto/dsa/dsa.h.fips	2013-02-19 20:12:54.099655464 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa.h	2013-02-19 20:12:54.581664610 +0100
 @@ -88,6 +88,8 @@
  # define OPENSSL_DSA_MAX_MODULUS_BITS	10000
  #endif
@@ -893,9 +893,9 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa.h.fips openssl-1.0.0f/crypto/dsa/dsa.h
  #define DSA_R_PARAMETER_ENCODING_ERROR			 105
  
  #ifdef  __cplusplus
-diff -up openssl-1.0.0f/crypto/dsa/dsa_key.c.fips openssl-1.0.0f/crypto/dsa/dsa_key.c
---- openssl-1.0.0f/crypto/dsa/dsa_key.c.fips	2007-03-28 02:15:25.000000000 +0200
-+++ openssl-1.0.0f/crypto/dsa/dsa_key.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa_key.c.fips openssl-1.0.0k/crypto/dsa/dsa_key.c
+--- openssl-1.0.0k/crypto/dsa/dsa_key.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa_key.c	2013-02-19 20:12:54.581664610 +0100
 @@ -63,9 +63,55 @@
  #include <openssl/bn.h>
  #include <openssl/dsa.h>
@@ -983,9 +983,9 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_key.c.fips openssl-1.0.0f/crypto/dsa/dsa_
  	ok=1;
  
  err:
-diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa_ossl.c
---- openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips	2011-02-01 13:54:04.000000000 +0100
-+++ openssl-1.0.0f/crypto/dsa/dsa_ossl.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0k/crypto/dsa/dsa_ossl.c
+--- openssl-1.0.0k/crypto/dsa/dsa_ossl.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa_ossl.c	2013-02-19 20:12:54.582664628 +0100
 @@ -65,6 +65,9 @@
  #include <openssl/dsa.h>
  #include <openssl/rand.h>
@@ -1026,7 +1026,7 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa
  	BN_init(&m);
  	BN_init(&xr);
  
-@@ -303,6 +320,20 @@ static int dsa_do_verify(const unsigned 
+@@ -303,6 +320,20 @@ static int dsa_do_verify(const unsigned
  		return -1;
  		}
  
@@ -1047,7 +1047,7 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa
  	i = BN_num_bits(dsa->q);
  	/* fips 186-3 allows only different sizes for q */
  	if (i != 160 && i != 224 && i != 256)
-@@ -385,6 +416,9 @@ static int dsa_do_verify(const unsigned 
+@@ -385,6 +416,9 @@ static int dsa_do_verify(const unsigned
  
  static int dsa_init(DSA *dsa)
  {
@@ -1057,10 +1057,10 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa
  	dsa->flags|=DSA_FLAG_CACHE_MONT_P;
  	return(1);
  }
-diff -up openssl-1.0.0f/crypto/err/err_all.c.fips openssl-1.0.0f/crypto/err/err_all.c
---- openssl-1.0.0f/crypto/err/err_all.c.fips	2009-08-09 16:58:05.000000000 +0200
-+++ openssl-1.0.0f/crypto/err/err_all.c	2012-01-05 13:22:30.000000000 +0100
-@@ -96,6 +96,9 @@
+diff -up openssl-1.0.0k/crypto/err/err_all.c.fips openssl-1.0.0k/crypto/err/err_all.c
+--- openssl-1.0.0k/crypto/err/err_all.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/err/err_all.c	2013-02-19 20:12:54.582664628 +0100
+@@ -98,6 +98,9 @@
  #include <openssl/ocsp.h>
  #include <openssl/err.h>
  #include <openssl/ts.h>
@@ -1070,7 +1070,7 @@ diff -up openssl-1.0.0f/crypto/err/err_all.c.fips openssl-1.0.0f/crypto/err/err_
  #ifndef OPENSSL_NO_CMS
  #include <openssl/cms.h>
  #endif
-@@ -149,6 +152,9 @@ void ERR_load_crypto_strings(void)
+@@ -152,6 +155,9 @@ void ERR_load_crypto_strings(void)
  #endif
  	ERR_load_OCSP_strings();
  	ERR_load_UI_strings();
@@ -1080,9 +1080,9 @@ diff -up openssl-1.0.0f/crypto/err/err_all.c.fips openssl-1.0.0f/crypto/err/err_
  #ifndef OPENSSL_NO_CMS
  	ERR_load_CMS_strings();
  #endif
-diff -up openssl-1.0.0f/crypto/evp/digest.c.fips openssl-1.0.0f/crypto/evp/digest.c
---- openssl-1.0.0f/crypto/evp/digest.c.fips	2010-03-05 14:33:43.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/digest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/digest.c.fips openssl-1.0.0k/crypto/evp/digest.c
+--- openssl-1.0.0k/crypto/evp/digest.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/digest.c	2013-02-19 20:12:54.582664628 +0100
 @@ -116,6 +116,7 @@
  #ifndef OPENSSL_NO_ENGINE
  #include <openssl/engine.h>
@@ -1181,9 +1181,9 @@ diff -up openssl-1.0.0f/crypto/evp/digest.c.fips openssl-1.0.0f/crypto/evp/diges
  
  	OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
  	ret=ctx->digest->final(ctx,md);
-diff -up openssl-1.0.0f/crypto/evp/e_aes.c.fips openssl-1.0.0f/crypto/evp/e_aes.c
---- openssl-1.0.0f/crypto/evp/e_aes.c.fips	2004-01-28 20:05:33.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_aes.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_aes.c.fips openssl-1.0.0k/crypto/evp/e_aes.c
+--- openssl-1.0.0k/crypto/evp/e_aes.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_aes.c	2013-02-19 20:12:54.583664647 +0100
 @@ -69,32 +69,29 @@ typedef struct
  
  IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
@@ -1210,35 +1210,32 @@ diff -up openssl-1.0.0f/crypto/evp/e_aes.c.fips openssl-1.0.0f/crypto/evp/e_aes.
 -		       EVP_CIPHER_set_asn1_iv,
 -		       EVP_CIPHER_get_asn1_iv,
 -		       NULL)
--
--#define IMPLEMENT_AES_CFBR(ksize,cbits)	IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
--
--IMPLEMENT_AES_CFBR(128,1)
--IMPLEMENT_AES_CFBR(192,1)
--IMPLEMENT_AES_CFBR(256,1)
--
--IMPLEMENT_AES_CFBR(128,8)
--IMPLEMENT_AES_CFBR(192,8)
--IMPLEMENT_AES_CFBR(256,8)
 +		       EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_DEFAULT_ASN1,
 +		       aes_init_key,
 +		       NULL, NULL, NULL, NULL)
-+
+ 
+-#define IMPLEMENT_AES_CFBR(ksize,cbits)	IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
 +#define IMPLEMENT_AES_CFBR(ksize,cbits,flags)	IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16,flags)
-+
+ 
+-IMPLEMENT_AES_CFBR(128,1)
+-IMPLEMENT_AES_CFBR(192,1)
+-IMPLEMENT_AES_CFBR(256,1)
 +IMPLEMENT_AES_CFBR(128,1,EVP_CIPH_FLAG_FIPS)
 +IMPLEMENT_AES_CFBR(192,1,EVP_CIPH_FLAG_FIPS)
 +IMPLEMENT_AES_CFBR(256,1,EVP_CIPH_FLAG_FIPS)
-+
+ 
+-IMPLEMENT_AES_CFBR(128,8)
+-IMPLEMENT_AES_CFBR(192,8)
+-IMPLEMENT_AES_CFBR(256,8)
 +IMPLEMENT_AES_CFBR(128,8,EVP_CIPH_FLAG_FIPS)
 +IMPLEMENT_AES_CFBR(192,8,EVP_CIPH_FLAG_FIPS)
 +IMPLEMENT_AES_CFBR(256,8,EVP_CIPH_FLAG_FIPS)
  
  static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
  		   const unsigned char *iv, int enc)
-diff -up openssl-1.0.0f/crypto/evp/e_camellia.c.fips openssl-1.0.0f/crypto/evp/e_camellia.c
---- openssl-1.0.0f/crypto/evp/e_camellia.c.fips	2006-08-31 22:56:20.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/e_camellia.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_camellia.c.fips openssl-1.0.0k/crypto/evp/e_camellia.c
+--- openssl-1.0.0k/crypto/evp/e_camellia.c.fips	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_camellia.c	2013-02-19 20:12:54.583664647 +0100
 @@ -93,7 +93,7 @@ IMPLEMENT_BLOCK_CIPHER(camellia_256, ks,
  	EVP_CIPHER_get_asn1_iv,
  	NULL)
@@ -1248,9 +1245,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_camellia.c.fips openssl-1.0.0f/crypto/evp/e
  
  IMPLEMENT_CAMELLIA_CFBR(128,1)
  IMPLEMENT_CAMELLIA_CFBR(192,1)
-diff -up openssl-1.0.0f/crypto/evp/e_des3.c.fips openssl-1.0.0f/crypto/evp/e_des3.c
---- openssl-1.0.0f/crypto/evp/e_des3.c.fips	2008-12-29 13:35:47.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_des3.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_des3.c.fips openssl-1.0.0k/crypto/evp/e_des3.c
+--- openssl-1.0.0k/crypto/evp/e_des3.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_des3.c	2013-02-19 20:12:54.583664647 +0100
 @@ -206,9 +206,9 @@ static int des_ede3_cfb8_cipher(EVP_CIPH
      }
  
@@ -1264,7 +1261,7 @@ diff -up openssl-1.0.0f/crypto/evp/e_des3.c.fips openssl-1.0.0f/crypto/evp/e_des
  			des3_ctrl)
  
  #define des_ede3_cfb64_cipher des_ede_cfb64_cipher
-@@ -217,21 +217,21 @@ BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY, 
+@@ -217,21 +217,21 @@ BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY,
  #define des_ede3_ecb_cipher des_ede_ecb_cipher
  
  BLOCK_CIPHER_defs(des_ede3, DES_EDE_KEY, NID_des_ede3, 8, 24, 8, 64,
@@ -1295,9 +1292,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_des3.c.fips openssl-1.0.0f/crypto/evp/e_des
  		     des3_ctrl)
  
  static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-diff -up openssl-1.0.0f/crypto/evp/e_null.c.fips openssl-1.0.0f/crypto/evp/e_null.c
---- openssl-1.0.0f/crypto/evp/e_null.c.fips	2008-10-31 20:48:24.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_null.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_null.c.fips openssl-1.0.0k/crypto/evp/e_null.c
+--- openssl-1.0.0k/crypto/evp/e_null.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_null.c	2013-02-19 20:12:54.584664666 +0100
 @@ -69,7 +69,7 @@ static const EVP_CIPHER n_cipher=
  	{
  	NID_undef,
@@ -1307,9 +1304,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_null.c.fips openssl-1.0.0f/crypto/evp/e_nul
  	null_init_key,
  	null_cipher,
  	NULL,
-diff -up openssl-1.0.0f/crypto/evp/e_rc4.c.fips openssl-1.0.0f/crypto/evp/e_rc4.c
---- openssl-1.0.0f/crypto/evp/e_rc4.c.fips	2008-10-31 20:48:24.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_rc4.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_rc4.c.fips openssl-1.0.0k/crypto/evp/e_rc4.c
+--- openssl-1.0.0k/crypto/evp/e_rc4.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_rc4.c	2013-02-19 20:12:54.584664666 +0100
 @@ -64,6 +64,7 @@
  #include <openssl/evp.h>
  #include <openssl/objects.h>
@@ -1318,9 +1315,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_rc4.c.fips openssl-1.0.0f/crypto/evp/e_rc4.
  
  /* FIXME: surely this is available elsewhere? */
  #define EVP_RC4_KEY_SIZE		16
-diff -up openssl-1.0.0f/crypto/evp/evp_enc.c.fips openssl-1.0.0f/crypto/evp/evp_enc.c
---- openssl-1.0.0f/crypto/evp/evp_enc.c.fips	2010-10-12 01:24:49.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/evp_enc.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_enc.c.fips openssl-1.0.0k/crypto/evp/evp_enc.c
+--- openssl-1.0.0k/crypto/evp/evp_enc.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/evp_enc.c	2013-02-19 20:12:54.584664666 +0100
 @@ -68,8 +68,53 @@
  
  const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT;
@@ -1413,9 +1410,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp_enc.c.fips openssl-1.0.0f/crypto/evp/evp_
  	if(key || (ctx->cipher->flags & EVP_CIPH_ALWAYS_CALL_INIT)) {
  		if(!ctx->cipher->init(ctx,key,iv,enc)) return 0;
  	}
-diff -up openssl-1.0.0f/crypto/evp/evp_err.c.fips openssl-1.0.0f/crypto/evp/evp_err.c
---- openssl-1.0.0f/crypto/evp/evp_err.c.fips	2010-02-07 14:41:23.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp_err.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_err.c.fips openssl-1.0.0k/crypto/evp/evp_err.c
+--- openssl-1.0.0k/crypto/evp/evp_err.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/evp_err.c	2013-02-19 20:12:54.585664685 +0100
 @@ -155,6 +155,7 @@ static ERR_STRING_DATA EVP_str_reasons[]
  {ERR_REASON(EVP_R_DECODE_ERROR)          ,"decode error"},
  {ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES)   ,"different key types"},
@@ -1424,9 +1421,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp_err.c.fips openssl-1.0.0f/crypto/evp/evp_
  {ERR_REASON(EVP_R_ENCODE_ERROR)          ,"encode error"},
  {ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"},
  {ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY)  ,"expecting an rsa key"},
-diff -up openssl-1.0.0f/crypto/evp/evp.h.fips openssl-1.0.0f/crypto/evp/evp.h
---- openssl-1.0.0f/crypto/evp/evp.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp.h.fips openssl-1.0.0k/crypto/evp/evp.h
+--- openssl-1.0.0k/crypto/evp/evp.h.fips	2013-02-19 20:12:54.344660112 +0100
++++ openssl-1.0.0k/crypto/evp/evp.h	2013-02-19 20:12:54.585664685 +0100
 @@ -75,6 +75,10 @@
  #include <openssl/bio.h>
  #endif
@@ -1496,9 +1493,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp.h.fips openssl-1.0.0f/crypto/evp/evp.h
  #define EVP_R_ENCODE_ERROR				 115
  #define EVP_R_EVP_PBE_CIPHERINIT_ERROR			 119
  #define EVP_R_EXPECTING_AN_RSA_KEY			 127
-diff -up openssl-1.0.0f/crypto/evp/evp_lib.c.fips openssl-1.0.0f/crypto/evp/evp_lib.c
---- openssl-1.0.0f/crypto/evp/evp_lib.c.fips	2010-01-26 15:33:51.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp_lib.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_lib.c.fips openssl-1.0.0k/crypto/evp/evp_lib.c
+--- openssl-1.0.0k/crypto/evp/evp_lib.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/evp_lib.c	2013-02-19 20:12:54.586664704 +0100
 @@ -67,6 +67,8 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_
  
  	if (c->cipher->set_asn1_parameters != NULL)
@@ -1527,9 +1524,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp_lib.c.fips openssl-1.0.0f/crypto/evp/evp_
  	return ctx->cipher->do_cipher(ctx,out,in,inl);
  	}
  
-diff -up openssl-1.0.0f/crypto/evp/evp_locl.h.fips openssl-1.0.0f/crypto/evp/evp_locl.h
---- openssl-1.0.0f/crypto/evp/evp_locl.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp_locl.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_locl.h.fips openssl-1.0.0k/crypto/evp/evp_locl.h
+--- openssl-1.0.0k/crypto/evp/evp_locl.h.fips	2013-02-19 20:12:54.335659942 +0100
++++ openssl-1.0.0k/crypto/evp/evp_locl.h	2013-02-19 20:12:54.586664704 +0100
 @@ -254,14 +254,32 @@ const EVP_CIPHER *EVP_##cname##_ecb(void
  
  #define EVP_C_DATA(kstruct, ctx)	((kstruct *)(ctx)->cipher_data)
@@ -1568,33 +1565,33 @@ diff -up openssl-1.0.0f/crypto/evp/evp_locl.h.fips openssl-1.0.0f/crypto/evp/evp
  
  struct evp_pkey_ctx_st
  	{
-diff -up openssl-1.0.0f/crypto/evp/m_dss.c.fips openssl-1.0.0f/crypto/evp/m_dss.c
---- openssl-1.0.0f/crypto/evp/m_dss.c.fips	2006-04-19 19:05:57.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_dss.c	2012-01-05 13:22:30.000000000 +0100
-@@ -81,7 +81,7 @@ static const EVP_MD dsa_md=
- 	NID_dsaWithSHA,
- 	NID_dsaWithSHA,
+diff -up openssl-1.0.0k/crypto/evp/m_dss1.c.fips openssl-1.0.0k/crypto/evp/m_dss1.c
+--- openssl-1.0.0k/crypto/evp/m_dss1.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_dss1.c	2013-02-19 20:12:54.587664724 +0100
+@@ -82,7 +82,7 @@ static const EVP_MD dss1_md=
+ 	NID_dsa,
+ 	NID_dsaWithSHA1,
  	SHA_DIGEST_LENGTH,
 -	EVP_MD_FLAG_PKEY_DIGEST,
 +	EVP_MD_FLAG_PKEY_DIGEST|EVP_MD_FLAG_FIPS,
  	init,
  	update,
  	final,
-diff -up openssl-1.0.0f/crypto/evp/m_dss1.c.fips openssl-1.0.0f/crypto/evp/m_dss1.c
---- openssl-1.0.0f/crypto/evp/m_dss1.c.fips	2006-04-19 19:05:57.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_dss1.c	2012-01-05 13:22:30.000000000 +0100
-@@ -82,7 +82,7 @@ static const EVP_MD dss1_md=
- 	NID_dsa,
- 	NID_dsaWithSHA1,
+diff -up openssl-1.0.0k/crypto/evp/m_dss.c.fips openssl-1.0.0k/crypto/evp/m_dss.c
+--- openssl-1.0.0k/crypto/evp/m_dss.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_dss.c	2013-02-19 20:12:54.587664724 +0100
+@@ -81,7 +81,7 @@ static const EVP_MD dsa_md=
+ 	NID_dsaWithSHA,
+ 	NID_dsaWithSHA,
  	SHA_DIGEST_LENGTH,
 -	EVP_MD_FLAG_PKEY_DIGEST,
 +	EVP_MD_FLAG_PKEY_DIGEST|EVP_MD_FLAG_FIPS,
  	init,
  	update,
  	final,
-diff -up openssl-1.0.0f/crypto/evp/m_mdc2.c.fips openssl-1.0.0f/crypto/evp/m_mdc2.c
---- openssl-1.0.0f/crypto/evp/m_mdc2.c.fips	2010-02-02 14:36:05.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/m_mdc2.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/m_md2.c.fips openssl-1.0.0k/crypto/evp/m_md2.c
+--- openssl-1.0.0k/crypto/evp/m_md2.c.fips	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_md2.c	2013-02-19 20:12:54.587664724 +0100
 @@ -68,6 +68,7 @@
  #ifndef OPENSSL_NO_RSA
  #include <openssl/rsa.h>
@@ -1602,10 +1599,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_mdc2.c.fips openssl-1.0.0f/crypto/evp/m_mdc
 +#include "evp_locl.h"
  
  static int init(EVP_MD_CTX *ctx)
- 	{ return MDC2_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_md2.c.fips openssl-1.0.0f/crypto/evp/m_md2.c
---- openssl-1.0.0f/crypto/evp/m_md2.c.fips	2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_md2.c	2012-01-05 13:22:30.000000000 +0100
+ 	{ return MD2_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_md4.c.fips openssl-1.0.0k/crypto/evp/m_md4.c
+--- openssl-1.0.0k/crypto/evp/m_md4.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_md4.c	2013-02-19 20:12:54.588664743 +0100
 @@ -68,6 +68,7 @@
  #ifndef OPENSSL_NO_RSA
  #include <openssl/rsa.h>
@@ -1613,10 +1610,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_md2.c.fips openssl-1.0.0f/crypto/evp/m_md2.
 +#include "evp_locl.h"
  
  static int init(EVP_MD_CTX *ctx)
- 	{ return MD2_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_md4.c.fips openssl-1.0.0f/crypto/evp/m_md4.c
---- openssl-1.0.0f/crypto/evp/m_md4.c.fips	2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_md4.c	2012-01-05 13:22:30.000000000 +0100
+ 	{ return MD4_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_md5.c.fips openssl-1.0.0k/crypto/evp/m_md5.c
+--- openssl-1.0.0k/crypto/evp/m_md5.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_md5.c	2013-02-19 20:12:54.588664743 +0100
 @@ -68,6 +68,7 @@
  #ifndef OPENSSL_NO_RSA
  #include <openssl/rsa.h>
@@ -1624,10 +1621,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_md4.c.fips openssl-1.0.0f/crypto/evp/m_md4.
 +#include "evp_locl.h"
  
  static int init(EVP_MD_CTX *ctx)
- 	{ return MD4_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_md5.c.fips openssl-1.0.0f/crypto/evp/m_md5.c
---- openssl-1.0.0f/crypto/evp/m_md5.c.fips	2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_md5.c	2012-01-05 13:22:30.000000000 +0100
+ 	{ return MD5_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_mdc2.c.fips openssl-1.0.0k/crypto/evp/m_mdc2.c
+--- openssl-1.0.0k/crypto/evp/m_mdc2.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_mdc2.c	2013-02-19 20:12:54.587664724 +0100
 @@ -68,6 +68,7 @@
  #ifndef OPENSSL_NO_RSA
  #include <openssl/rsa.h>
@@ -1635,10 +1632,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_md5.c.fips openssl-1.0.0f/crypto/evp/m_md5.
 +#include "evp_locl.h"
  
  static int init(EVP_MD_CTX *ctx)
- 	{ return MD5_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_ripemd.c.fips openssl-1.0.0f/crypto/evp/m_ripemd.c
---- openssl-1.0.0f/crypto/evp/m_ripemd.c.fips	2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_ripemd.c	2012-01-05 13:22:30.000000000 +0100
+ 	{ return MDC2_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_ripemd.c.fips openssl-1.0.0k/crypto/evp/m_ripemd.c
+--- openssl-1.0.0k/crypto/evp/m_ripemd.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_ripemd.c	2013-02-19 20:12:54.588664743 +0100
 @@ -68,6 +68,7 @@
  #ifndef OPENSSL_NO_RSA
  #include <openssl/rsa.h>
@@ -1647,9 +1644,9 @@ diff -up openssl-1.0.0f/crypto/evp/m_ripemd.c.fips openssl-1.0.0f/crypto/evp/m_r
  
  static int init(EVP_MD_CTX *ctx)
  	{ return RIPEMD160_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_sha1.c.fips openssl-1.0.0f/crypto/evp/m_sha1.c
---- openssl-1.0.0f/crypto/evp/m_sha1.c.fips	2008-03-12 22:14:24.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/m_sha1.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/m_sha1.c.fips openssl-1.0.0k/crypto/evp/m_sha1.c
+--- openssl-1.0.0k/crypto/evp/m_sha1.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_sha1.c	2013-02-19 20:12:54.589664762 +0100
 @@ -82,7 +82,8 @@ static const EVP_MD sha1_md=
  	NID_sha1,
  	NID_sha1WithRSAEncryption,
@@ -1700,9 +1697,9 @@ diff -up openssl-1.0.0f/crypto/evp/m_sha1.c.fips openssl-1.0.0f/crypto/evp/m_sha
  	init512,
  	update512,
  	final512,
-diff -up openssl-1.0.0f/crypto/evp/m_wp.c.fips openssl-1.0.0f/crypto/evp/m_wp.c
---- openssl-1.0.0f/crypto/evp/m_wp.c.fips	2005-11-30 21:57:23.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/m_wp.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/m_wp.c.fips openssl-1.0.0k/crypto/evp/m_wp.c
+--- openssl-1.0.0k/crypto/evp/m_wp.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_wp.c	2013-02-19 20:12:54.589664762 +0100
 @@ -9,6 +9,7 @@
  #include <openssl/objects.h>
  #include <openssl/x509.h>
@@ -1711,9 +1708,9 @@ diff -up openssl-1.0.0f/crypto/evp/m_wp.c.fips openssl-1.0.0f/crypto/evp/m_wp.c
  
  static int init(EVP_MD_CTX *ctx)
  	{ return WHIRLPOOL_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/names.c.fips openssl-1.0.0f/crypto/evp/names.c
---- openssl-1.0.0f/crypto/evp/names.c.fips	2010-03-06 21:47:45.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/names.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/names.c.fips openssl-1.0.0k/crypto/evp/names.c
+--- openssl-1.0.0k/crypto/evp/names.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/names.c	2013-02-19 20:12:54.589664762 +0100
 @@ -66,6 +66,10 @@ int EVP_add_cipher(const EVP_CIPHER *c)
  	{
  	int r;
@@ -1736,9 +1733,9 @@ diff -up openssl-1.0.0f/crypto/evp/names.c.fips openssl-1.0.0f/crypto/evp/names.
  	name=OBJ_nid2sn(md->type);
  	r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md);
  	if (r == 0) return(0);
-diff -up openssl-1.0.0f/crypto/evp/p_sign.c.fips openssl-1.0.0f/crypto/evp/p_sign.c
---- openssl-1.0.0f/crypto/evp/p_sign.c.fips	2010-11-27 18:34:57.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/p_sign.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/p_sign.c.fips openssl-1.0.0k/crypto/evp/p_sign.c
+--- openssl-1.0.0k/crypto/evp/p_sign.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/p_sign.c	2013-02-19 20:12:54.589664762 +0100
 @@ -61,6 +61,7 @@
  #include <openssl/evp.h>
  #include <openssl/objects.h>
@@ -1770,9 +1767,9 @@ diff -up openssl-1.0.0f/crypto/evp/p_sign.c.fips openssl-1.0.0f/crypto/evp/p_sig
  		if (EVP_PKEY_sign(pkctx, sigret, &sltmp, m, m_len) <= 0)
  			goto err;
  		*siglen = sltmp;
-diff -up openssl-1.0.0f/crypto/evp/p_verify.c.fips openssl-1.0.0f/crypto/evp/p_verify.c
---- openssl-1.0.0f/crypto/evp/p_verify.c.fips	2010-11-27 18:34:57.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/p_verify.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/p_verify.c.fips openssl-1.0.0k/crypto/evp/p_verify.c
+--- openssl-1.0.0k/crypto/evp/p_verify.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/p_verify.c	2013-02-19 20:12:54.590664781 +0100
 @@ -61,6 +61,7 @@
  #include <openssl/evp.h>
  #include <openssl/objects.h>
@@ -1804,9 +1801,9 @@ diff -up openssl-1.0.0f/crypto/evp/p_verify.c.fips openssl-1.0.0f/crypto/evp/p_v
  		i = EVP_PKEY_verify(pkctx, sigbuf, siglen, m, m_len);
  		err:
  		EVP_PKEY_CTX_free(pkctx);
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c.fips	2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c	2013-02-19 20:12:54.591664800 +0100
 @@ -0,0 +1,939 @@
 +/* ====================================================================
 + * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
@@ -2747,9 +2744,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c.fips openssl-1.0.0f/crypt
 +    }
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c.fips	2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c	2013-02-19 20:12:54.591664800 +0100
 @@ -0,0 +1,702 @@
 +/* ====================================================================
 + * Copyright (c) 2004 The OpenSSL Project.  All rights reserved.
@@ -3453,9 +3450,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c.fips openssl-1.0.0f/cryp
 +    }
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c.fips	2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c	2013-02-19 20:12:54.591664800 +0100
 @@ -0,0 +1,537 @@
 +#include <openssl/opensslconf.h>
 +
@@ -3994,9 +3991,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c.fips openssl-1.0.0f/crypto
 +    }
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c.fips	2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c	2013-02-19 20:12:54.591664800 +0100
 @@ -0,0 +1,230 @@
 +/*
 + * Crude test driver for processing the VST and MCT testvector files
@@ -4228,9 +4225,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c.fips openssl-1.0.0f/crypto
 +    return 0;
 +    }
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c.fips	2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c	2013-02-19 20:12:54.592664819 +0100
 @@ -0,0 +1,390 @@
 +/* fips_rsagtest.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -4622,9 +4619,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c.fips openssl-1.0.0f/cry
 +	}
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c.fips	2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c	2013-02-19 20:12:54.592664819 +0100
 @@ -0,0 +1,370 @@
 +/* fips_rsastest.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -4996,9 +4993,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c.fips openssl-1.0.0f/cry
 +	return ret;
 +	}
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c.fips	2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c	2013-02-19 20:12:54.592664819 +0100
 @@ -0,0 +1,377 @@
 +/* fips_rsavtest.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -5377,9 +5374,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c.fips openssl-1.0.0f/cry
 +	return ret;
 +	}
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c.fips	2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c	2013-02-19 20:12:54.592664819 +0100
 @@ -0,0 +1,388 @@
 +/* fips_shatest.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -5769,9 +5766,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c.fips openssl-1.0.0f/cryp
 +	}
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_utl.h.fips openssl-1.0.0f/crypto/fips/cavs/fips_utl.h
---- openssl-1.0.0f/crypto/fips/cavs/fips_utl.h.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_utl.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_utl.h.fips openssl-1.0.0k/crypto/fips/cavs/fips_utl.h
+--- openssl-1.0.0k/crypto/fips/cavs/fips_utl.h.fips	2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_utl.h	2013-02-19 20:12:54.593664838 +0100
 @@ -0,0 +1,343 @@
 +/* ====================================================================
 + * Copyright (c) 2007 The OpenSSL Project.  All rights reserved.
@@ -6116,9 +6113,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_utl.h.fips openssl-1.0.0f/crypto/f
 +#endif
 +    }
 +
-diff -up openssl-1.0.0f/crypto/fips_err.c.fips openssl-1.0.0f/crypto/fips_err.c
---- openssl-1.0.0f/crypto/fips_err.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips_err.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips_err.c.fips openssl-1.0.0k/crypto/fips_err.c
+--- openssl-1.0.0k/crypto/fips_err.c.fips	2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips_err.c	2013-02-19 20:12:54.593664838 +0100
 @@ -0,0 +1,7 @@
 +#include <openssl/opensslconf.h>
 +
@@ -6127,9 +6124,9 @@ diff -up openssl-1.0.0f/crypto/fips_err.c.fips openssl-1.0.0f/crypto/fips_err.c
 +#else
 +static void *dummy=&dummy;
 +#endif
-diff -up openssl-1.0.0f/crypto/fips_err.h.fips openssl-1.0.0f/crypto/fips_err.h
---- openssl-1.0.0f/crypto/fips_err.h.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips_err.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips_err.h.fips openssl-1.0.0k/crypto/fips_err.h
+--- openssl-1.0.0k/crypto/fips_err.h.fips	2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips_err.h	2013-02-19 20:12:54.593664838 +0100
 @@ -0,0 +1,137 @@
 +/* crypto/fips_err.h */
 +/* ====================================================================
@@ -6268,9 +6265,9 @@ diff -up openssl-1.0.0f/crypto/fips_err.h.fips openssl-1.0.0f/crypto/fips_err.h
 +		}
 +#endif
 +	}
-diff -up openssl-1.0.0f/crypto/fips/fips_aes_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_aes_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_aes_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_aes_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_aes_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_aes_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_aes_selftest.c.fips	2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips/fips_aes_selftest.c	2013-02-19 20:12:54.593664838 +0100
 @@ -0,0 +1,103 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -6375,9 +6372,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_aes_selftest.c.fips openssl-1.0.0f/cryp
 +    return ret;
 +    }
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips.c.fips openssl-1.0.0f/crypto/fips/fips.c
---- openssl-1.0.0f/crypto/fips/fips.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips.c.fips openssl-1.0.0k/crypto/fips/fips.c
+--- openssl-1.0.0k/crypto/fips/fips.c.fips	2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips/fips.c	2013-02-19 20:12:54.593664838 +0100
 @@ -0,0 +1,419 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -6798,9 +6795,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips.c.fips openssl-1.0.0f/crypto/fips/fips.
 +
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_des_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_des_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_des_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_des_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_des_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_des_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_des_selftest.c.fips	2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_des_selftest.c	2013-02-19 20:12:54.594664857 +0100
 @@ -0,0 +1,139 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -6941,9 +6938,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_des_selftest.c.fips openssl-1.0.0f/cryp
 +    return ret;
 +    }
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c.fips	2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c	2013-02-19 20:12:54.594664857 +0100
 @@ -0,0 +1,186 @@
 +/* crypto/dsa/dsatest.c */
 +/* Copyright (C) 1995-1998 Eric Young (eay at cryptsoft.com)
@@ -7131,9 +7128,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c.fips openssl-1.0.0f/cryp
 +    return ret;
 +    }
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips.h.fips openssl-1.0.0f/crypto/fips/fips.h
---- openssl-1.0.0f/crypto/fips/fips.h.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips.h.fips openssl-1.0.0k/crypto/fips/fips.h
+--- openssl-1.0.0k/crypto/fips/fips.h.fips	2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips.h	2013-02-19 20:12:54.594664857 +0100
 @@ -0,0 +1,163 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -7298,9 +7295,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips.h.fips openssl-1.0.0f/crypto/fips/fips.
 +}
 +#endif
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c.fips	2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c	2013-02-19 20:12:54.594664857 +0100
 @@ -0,0 +1,137 @@
 +/* ====================================================================
 + * Copyright (c) 2005 The OpenSSL Project.  All rights reserved.
@@ -7439,9 +7436,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c.fips openssl-1.0.0f/cry
 +    return 1;
 +    }
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rand.c.fips openssl-1.0.0f/crypto/fips/fips_rand.c
---- openssl-1.0.0f/crypto/fips/fips_rand.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rand.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rand.c.fips openssl-1.0.0k/crypto/fips/fips_rand.c
+--- openssl-1.0.0k/crypto/fips/fips_rand.c.fips	2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rand.c	2013-02-19 20:12:54.594664857 +0100
 @@ -0,0 +1,412 @@
 +/* ====================================================================
 + * Copyright (c) 2007 The OpenSSL Project.  All rights reserved.
@@ -7855,9 +7852,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rand.c.fips openssl-1.0.0f/crypto/fips/
 +}
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rand.h.fips openssl-1.0.0f/crypto/fips/fips_rand.h
---- openssl-1.0.0f/crypto/fips/fips_rand.h.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rand.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rand.h.fips openssl-1.0.0k/crypto/fips/fips_rand.h
+--- openssl-1.0.0k/crypto/fips/fips_rand.h.fips	2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rand.h	2013-02-19 20:12:54.595664876 +0100
 @@ -0,0 +1,77 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -7936,9 +7933,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rand.h.fips openssl-1.0.0f/crypto/fips/
 +#endif
 +#endif
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rand_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_rand_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_rand_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rand_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rand_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_rand_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_rand_selftest.c.fips	2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rand_selftest.c	2013-02-19 20:12:54.595664876 +0100
 @@ -0,0 +1,373 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -8313,9 +8310,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rand_selftest.c.fips openssl-1.0.0f/cry
 +	}
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_randtest.c.fips openssl-1.0.0f/crypto/fips/fips_randtest.c
---- openssl-1.0.0f/crypto/fips/fips_randtest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_randtest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_randtest.c.fips openssl-1.0.0k/crypto/fips/fips_randtest.c
+--- openssl-1.0.0k/crypto/fips/fips_randtest.c.fips	2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_randtest.c	2013-02-19 20:12:54.595664876 +0100
 @@ -0,0 +1,248 @@
 +/* Copyright (C) 1995-1998 Eric Young (eay at cryptsoft.com)
 + * All rights reserved.
@@ -8565,9 +8562,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_randtest.c.fips openssl-1.0.0f/crypto/f
 +	}
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c.fips	2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c	2013-02-19 20:12:54.595664876 +0100
 @@ -0,0 +1,441 @@
 +/* ====================================================================
 + * Copyright (c) 2003-2007 The OpenSSL Project.  All rights reserved.
@@ -9010,9 +9007,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c.fips openssl-1.0.0f/cryp
 +	}
 +
 +#endif /* def OPENSSL_FIPS */
-diff -up openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c.fips openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c
---- openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c.fips openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c
+--- openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c.fips	2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c	2013-02-19 20:12:54.596664895 +0100
 @@ -0,0 +1,281 @@
 +/* crypto/rsa/rsa_gen.c */
 +/* Copyright (C) 1995-1998 Eric Young (eay at cryptsoft.com)
@@ -9295,9 +9292,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c.fips openssl-1.0.0f/crypto/
 +	return 0;
 +
 +	}
-diff -up openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c.fips	2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c	2013-02-19 20:12:54.596664895 +0100
 @@ -0,0 +1,99 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -9398,9 +9395,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c.fips openssl-1.0.0f/cry
 +    }
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c.fips openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c
---- openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.fips openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c
+--- openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.fips	2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c	2013-02-19 20:12:54.596664895 +0100
 @@ -0,0 +1,173 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -9575,9 +9572,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c.fips openssl-1.0.0f/c
 +    }
 +
 +
-diff -up openssl-1.0.0f/crypto/fips/fips_test_suite.c.fips openssl-1.0.0f/crypto/fips/fips_test_suite.c
---- openssl-1.0.0f/crypto/fips/fips_test_suite.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_test_suite.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_test_suite.c.fips openssl-1.0.0k/crypto/fips/fips_test_suite.c
+--- openssl-1.0.0k/crypto/fips/fips_test_suite.c.fips	2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_test_suite.c	2013-02-19 20:12:54.596664895 +0100
 @@ -0,0 +1,588 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -10167,9 +10164,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_test_suite.c.fips openssl-1.0.0f/crypto
 +    }
 +
 +#endif
-diff -up openssl-1.0.0f/crypto/fips_locl.h.fips openssl-1.0.0f/crypto/fips_locl.h
---- openssl-1.0.0f/crypto/fips_locl.h.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips_locl.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips_locl.h.fips openssl-1.0.0k/crypto/fips_locl.h
+--- openssl-1.0.0k/crypto/fips_locl.h.fips	2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips_locl.h	2013-02-19 20:12:54.596664895 +0100
 @@ -0,0 +1,72 @@
 +/* ====================================================================
 + * Copyright (c) 2003 The OpenSSL Project.  All rights reserved.
@@ -10243,9 +10240,9 @@ diff -up openssl-1.0.0f/crypto/fips_locl.h.fips openssl-1.0.0f/crypto/fips_locl.
 +}
 +#endif
 +#endif
-diff -up openssl-1.0.0f/crypto/fips/Makefile.fips openssl-1.0.0f/crypto/fips/Makefile
---- openssl-1.0.0f/crypto/fips/Makefile.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/Makefile	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/Makefile.fips openssl-1.0.0k/crypto/fips/Makefile
+--- openssl-1.0.0k/crypto/fips/Makefile.fips	2013-02-19 20:12:54.597664913 +0100
++++ openssl-1.0.0k/crypto/fips/Makefile	2013-02-19 20:12:54.597664913 +0100
 @@ -0,0 +1,81 @@
 +#
 +# OpenSSL/crypto/fips/Makefile
@@ -10328,9 +10325,9 @@ diff -up openssl-1.0.0f/crypto/fips/Makefile.fips openssl-1.0.0f/crypto/fips/Mak
 +
 +# DO NOT DELETE THIS LINE -- make depend depends on it.
 +
-diff -up openssl-1.0.0f/crypto/hmac/hmac.c.fips openssl-1.0.0f/crypto/hmac/hmac.c
---- openssl-1.0.0f/crypto/hmac/hmac.c.fips	2010-06-15 19:25:09.000000000 +0200
-+++ openssl-1.0.0f/crypto/hmac/hmac.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/hmac/hmac.c.fips openssl-1.0.0k/crypto/hmac/hmac.c
+--- openssl-1.0.0k/crypto/hmac/hmac.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/hmac/hmac.c	2013-02-19 20:12:54.597664913 +0100
 @@ -77,6 +77,13 @@ int HMAC_Init_ex(HMAC_CTX *ctx, const vo
  
  	if (key != NULL)
@@ -10345,9 +10342,9 @@ diff -up openssl-1.0.0f/crypto/hmac/hmac.c.fips openssl-1.0.0f/crypto/hmac/hmac.
  		reset=1;
  		j=EVP_MD_block_size(md);
  		OPENSSL_assert(j <= (int)sizeof(ctx->key));
-diff -up openssl-1.0.0f/crypto/Makefile.fips openssl-1.0.0f/crypto/Makefile
---- openssl-1.0.0f/crypto/Makefile.fips	2010-07-27 00:09:59.000000000 +0200
-+++ openssl-1.0.0f/crypto/Makefile	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/Makefile.fips openssl-1.0.0k/crypto/Makefile
+--- openssl-1.0.0k/crypto/Makefile.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/Makefile	2013-02-19 20:12:54.597664913 +0100
 @@ -34,14 +34,14 @@ GENERAL=Makefile README crypto-lib.com i
  
  LIB= $(TOP)/libcrypto.a
@@ -10366,47 +10363,9 @@ diff -up openssl-1.0.0f/crypto/Makefile.fips openssl-1.0.0f/crypto/Makefile
  
  ALL=    $(GENERAL) $(SRC) $(HEADER)
  
-diff -up openssl-1.0.0f/crypto/mdc2/mdc2dgst.c.fips openssl-1.0.0f/crypto/mdc2/mdc2dgst.c
---- openssl-1.0.0f/crypto/mdc2/mdc2dgst.c.fips	2004-07-25 21:10:41.000000000 +0200
-+++ openssl-1.0.0f/crypto/mdc2/mdc2dgst.c	2012-01-05 13:22:30.000000000 +0100
-@@ -61,6 +61,11 @@
- #include <string.h>
- #include <openssl/des.h>
- #include <openssl/mdc2.h>
-+#include <openssl/err.h>
-+#ifdef OPENSSL_FIPS
-+#include <openssl/fips.h>
-+#endif
-+
- 
- #undef c2l
- #define c2l(c,l)	(l =((DES_LONG)(*((c)++)))    , \
-@@ -75,7 +80,7 @@
- 			*((c)++)=(unsigned char)(((l)>>24L)&0xff))
- 
- static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
--int MDC2_Init(MDC2_CTX *c)
-+FIPS_NON_FIPS_MD_Init(MDC2)
- 	{
- 	c->num=0;
- 	c->pad_type=1;
-diff -up openssl-1.0.0f/crypto/mdc2/mdc2.h.fips openssl-1.0.0f/crypto/mdc2/mdc2.h
---- openssl-1.0.0f/crypto/mdc2/mdc2.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/mdc2/mdc2.h	2012-01-05 13:22:30.000000000 +0100
-@@ -80,7 +80,9 @@ typedef struct mdc2_ctx_st
- 	int pad_type; /* either 1 or 2, default 1 */
- 	} MDC2_CTX;
- 
--
-+#ifdef OPENSSL_FIPS
-+int private_MDC2_Init(MDC2_CTX *c);
-+#endif
- int MDC2_Init(MDC2_CTX *c);
- int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
- int MDC2_Final(unsigned char *md, MDC2_CTX *c);
-diff -up openssl-1.0.0f/crypto/md2/md2_dgst.c.fips openssl-1.0.0f/crypto/md2/md2_dgst.c
---- openssl-1.0.0f/crypto/md2/md2_dgst.c.fips	2007-08-31 12:12:35.000000000 +0200
-+++ openssl-1.0.0f/crypto/md2/md2_dgst.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md2/md2_dgst.c.fips openssl-1.0.0k/crypto/md2/md2_dgst.c
+--- openssl-1.0.0k/crypto/md2/md2_dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/md2/md2_dgst.c	2013-02-19 20:12:54.597664913 +0100
 @@ -62,6 +62,11 @@
  #include <openssl/md2.h>
  #include <openssl/opensslv.h>
@@ -10428,9 +10387,9 @@ diff -up openssl-1.0.0f/crypto/md2/md2_dgst.c.fips openssl-1.0.0f/crypto/md2/md2
  	{
  	c->num=0;
  	memset(c->state,0,sizeof c->state);
-diff -up openssl-1.0.0f/crypto/md2/md2.h.fips openssl-1.0.0f/crypto/md2/md2.h
---- openssl-1.0.0f/crypto/md2/md2.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/md2/md2.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md2/md2.h.fips openssl-1.0.0k/crypto/md2/md2.h
+--- openssl-1.0.0k/crypto/md2/md2.h.fips	2013-02-19 20:12:54.348660189 +0100
++++ openssl-1.0.0k/crypto/md2/md2.h	2013-02-19 20:12:54.597664913 +0100
 @@ -81,6 +81,9 @@ typedef struct MD2state_st
  	} MD2_CTX;
  
@@ -10441,9 +10400,9 @@ diff -up openssl-1.0.0f/crypto/md2/md2.h.fips openssl-1.0.0f/crypto/md2/md2.h
  int MD2_Init(MD2_CTX *c);
  int MD2_Update(MD2_CTX *c, const unsigned char *data, size_t len);
  int MD2_Final(unsigned char *md, MD2_CTX *c);
-diff -up openssl-1.0.0f/crypto/md4/md4_dgst.c.fips openssl-1.0.0f/crypto/md4/md4_dgst.c
---- openssl-1.0.0f/crypto/md4/md4_dgst.c.fips	2007-01-21 14:07:11.000000000 +0100
-+++ openssl-1.0.0f/crypto/md4/md4_dgst.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md4/md4_dgst.c.fips openssl-1.0.0k/crypto/md4/md4_dgst.c
+--- openssl-1.0.0k/crypto/md4/md4_dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/md4/md4_dgst.c	2013-02-19 20:12:54.598664931 +0100
 @@ -59,6 +59,11 @@
  #include <stdio.h>
  #include "md4_locl.h"
@@ -10465,9 +10424,9 @@ diff -up openssl-1.0.0f/crypto/md4/md4_dgst.c.fips openssl-1.0.0f/crypto/md4/md4
  	{
  	memset (c,0,sizeof(*c));
  	c->A=INIT_DATA_A;
-diff -up openssl-1.0.0f/crypto/md4/md4.h.fips openssl-1.0.0f/crypto/md4/md4.h
---- openssl-1.0.0f/crypto/md4/md4.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/md4/md4.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md4/md4.h.fips openssl-1.0.0k/crypto/md4/md4.h
+--- openssl-1.0.0k/crypto/md4/md4.h.fips	2013-02-19 20:12:54.268658671 +0100
++++ openssl-1.0.0k/crypto/md4/md4.h	2013-02-19 20:12:54.598664931 +0100
 @@ -105,6 +105,9 @@ typedef struct MD4state_st
  	unsigned int num;
  	} MD4_CTX;
@@ -10478,9 +10437,9 @@ diff -up openssl-1.0.0f/crypto/md4/md4.h.fips openssl-1.0.0f/crypto/md4/md4.h
  int MD4_Init(MD4_CTX *c);
  int MD4_Update(MD4_CTX *c, const void *data, size_t len);
  int MD4_Final(unsigned char *md, MD4_CTX *c);
-diff -up openssl-1.0.0f/crypto/md5/md5_dgst.c.fips openssl-1.0.0f/crypto/md5/md5_dgst.c
---- openssl-1.0.0f/crypto/md5/md5_dgst.c.fips	2007-01-21 14:07:11.000000000 +0100
-+++ openssl-1.0.0f/crypto/md5/md5_dgst.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md5/md5_dgst.c.fips openssl-1.0.0k/crypto/md5/md5_dgst.c
+--- openssl-1.0.0k/crypto/md5/md5_dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/md5/md5_dgst.c	2013-02-19 20:12:54.598664931 +0100
 @@ -59,6 +59,11 @@
  #include <stdio.h>
  #include "md5_locl.h"
@@ -10502,9 +10461,9 @@ diff -up openssl-1.0.0f/crypto/md5/md5_dgst.c.fips openssl-1.0.0f/crypto/md5/md5
  	{
  	memset (c,0,sizeof(*c));
  	c->A=INIT_DATA_A;
-diff -up openssl-1.0.0f/crypto/md5/md5.h.fips openssl-1.0.0f/crypto/md5/md5.h
---- openssl-1.0.0f/crypto/md5/md5.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/md5/md5.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md5/md5.h.fips openssl-1.0.0k/crypto/md5/md5.h
+--- openssl-1.0.0k/crypto/md5/md5.h.fips	2013-02-19 20:12:54.012653813 +0100
++++ openssl-1.0.0k/crypto/md5/md5.h	2013-02-19 20:12:54.598664931 +0100
 @@ -105,6 +105,9 @@ typedef struct MD5state_st
  	unsigned int num;
  	} MD5_CTX;
@@ -10515,10 +10474,48 @@ diff -up openssl-1.0.0f/crypto/md5/md5.h.fips openssl-1.0.0f/crypto/md5/md5.h
  int MD5_Init(MD5_CTX *c);
  int MD5_Update(MD5_CTX *c, const void *data, size_t len);
  int MD5_Final(unsigned char *md, MD5_CTX *c);
-diff -up openssl-1.0.0f/crypto/mem.c.fips openssl-1.0.0f/crypto/mem.c
---- openssl-1.0.0f/crypto/mem.c.fips	2008-11-12 04:57:47.000000000 +0100
-+++ openssl-1.0.0f/crypto/mem.c	2012-01-05 13:22:30.000000000 +0100
-@@ -101,7 +101,7 @@ static void (*free_locked_func)(void *) 
+diff -up openssl-1.0.0k/crypto/mdc2/mdc2dgst.c.fips openssl-1.0.0k/crypto/mdc2/mdc2dgst.c
+--- openssl-1.0.0k/crypto/mdc2/mdc2dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/mdc2/mdc2dgst.c	2013-02-19 20:12:54.597664913 +0100
+@@ -61,6 +61,11 @@
+ #include <string.h>
+ #include <openssl/des.h>
+ #include <openssl/mdc2.h>
++#include <openssl/err.h>
++#ifdef OPENSSL_FIPS
++#include <openssl/fips.h>
++#endif
++
+ 
+ #undef c2l
+ #define c2l(c,l)	(l =((DES_LONG)(*((c)++)))    , \
+@@ -75,7 +80,7 @@
+ 			*((c)++)=(unsigned char)(((l)>>24L)&0xff))
+ 
+ static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
+-int MDC2_Init(MDC2_CTX *c)
++FIPS_NON_FIPS_MD_Init(MDC2)
+ 	{
+ 	c->num=0;
+ 	c->pad_type=1;
+diff -up openssl-1.0.0k/crypto/mdc2/mdc2.h.fips openssl-1.0.0k/crypto/mdc2/mdc2.h
+--- openssl-1.0.0k/crypto/mdc2/mdc2.h.fips	2013-02-19 20:12:54.061654741 +0100
++++ openssl-1.0.0k/crypto/mdc2/mdc2.h	2013-02-19 20:12:54.597664913 +0100
+@@ -80,7 +80,9 @@ typedef struct mdc2_ctx_st
+ 	int pad_type; /* either 1 or 2, default 1 */
+ 	} MDC2_CTX;
+ 
+-
++#ifdef OPENSSL_FIPS
++int private_MDC2_Init(MDC2_CTX *c);
++#endif
+ int MDC2_Init(MDC2_CTX *c);
+ int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
+ int MDC2_Final(unsigned char *md, MDC2_CTX *c);
+diff -up openssl-1.0.0k/crypto/mem.c.fips openssl-1.0.0k/crypto/mem.c
+--- openssl-1.0.0k/crypto/mem.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/mem.c	2013-02-19 20:12:54.598664931 +0100
+@@ -101,7 +101,7 @@ static void (*free_locked_func)(void *)
  
  /* may be changed as long as 'allow_customize_debug' is set */
  /* XXX use correct function pointer types */
@@ -10527,9 +10524,9 @@ diff -up openssl-1.0.0f/crypto/mem.c.fips openssl-1.0.0f/crypto/mem.c
  /* use default functions from mem_dbg.c */
  static void (*malloc_debug_func)(void *,int,const char *,int,int)
  	= CRYPTO_dbg_malloc;
-diff -up openssl-1.0.0f/crypto/o_init.c.fips openssl-1.0.0f/crypto/o_init.c
---- openssl-1.0.0f/crypto/o_init.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/o_init.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/o_init.c.fips openssl-1.0.0k/crypto/o_init.c
+--- openssl-1.0.0k/crypto/o_init.c.fips	2013-02-19 20:12:54.598664931 +0100
++++ openssl-1.0.0k/crypto/o_init.c	2013-02-19 20:12:54.598664931 +0100
 @@ -0,0 +1,80 @@
 +/* o_init.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -10611,9 +10608,9 @@ diff -up openssl-1.0.0f/crypto/o_init.c.fips openssl-1.0.0f/crypto/o_init.c
 +	}
 +		
 +
-diff -up openssl-1.0.0f/crypto/opensslconf.h.in.fips openssl-1.0.0f/crypto/opensslconf.h.in
---- openssl-1.0.0f/crypto/opensslconf.h.in.fips	2005-12-16 11:37:23.000000000 +0100
-+++ openssl-1.0.0f/crypto/opensslconf.h.in	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/opensslconf.h.in.fips openssl-1.0.0k/crypto/opensslconf.h.in
+--- openssl-1.0.0k/crypto/opensslconf.h.in.fips	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/opensslconf.h.in	2013-02-19 20:12:54.599664950 +0100
 @@ -1,5 +1,20 @@
  /* crypto/opensslconf.h.in */
  
@@ -10635,9 +10632,9 @@ diff -up openssl-1.0.0f/crypto/opensslconf.h.in.fips openssl-1.0.0f/crypto/opens
  /* Generate 80386 code? */
  #undef I386_ONLY
  
-diff -up openssl-1.0.0f/crypto/pkcs12/p12_crt.c.fips openssl-1.0.0f/crypto/pkcs12/p12_crt.c
---- openssl-1.0.0f/crypto/pkcs12/p12_crt.c.fips	2009-03-09 14:08:04.000000000 +0100
-+++ openssl-1.0.0f/crypto/pkcs12/p12_crt.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/pkcs12/p12_crt.c.fips openssl-1.0.0k/crypto/pkcs12/p12_crt.c
+--- openssl-1.0.0k/crypto/pkcs12/p12_crt.c.fips	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/pkcs12/p12_crt.c	2013-02-19 20:12:54.599664950 +0100
 @@ -59,6 +59,10 @@
  #include <stdio.h>
  #include "cryptlib.h"
@@ -10664,9 +10661,9 @@ diff -up openssl-1.0.0f/crypto/pkcs12/p12_crt.c.fips openssl-1.0.0f/crypto/pkcs1
  	if (!nid_key)
  		nid_key = NID_pbe_WithSHA1And3_Key_TripleDES_CBC;
  	if (!iter)
-diff -up openssl-1.0.0f/crypto/rand/md_rand.c.fips openssl-1.0.0f/crypto/rand/md_rand.c
---- openssl-1.0.0f/crypto/rand/md_rand.c.fips	2010-06-16 15:17:22.000000000 +0200
-+++ openssl-1.0.0f/crypto/rand/md_rand.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/md_rand.c.fips openssl-1.0.0k/crypto/rand/md_rand.c
+--- openssl-1.0.0k/crypto/rand/md_rand.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/md_rand.c	2013-02-19 20:12:54.599664950 +0100
 @@ -126,6 +126,10 @@
  
  #include <openssl/crypto.h>
@@ -10693,9 +10690,9 @@ diff -up openssl-1.0.0f/crypto/rand/md_rand.c.fips openssl-1.0.0f/crypto/rand/md
  #ifdef PREDICT
  	if (rand_predictable)
  		{
-diff -up openssl-1.0.0f/crypto/rand/rand_err.c.fips openssl-1.0.0f/crypto/rand/rand_err.c
---- openssl-1.0.0f/crypto/rand/rand_err.c.fips	2006-11-21 22:29:41.000000000 +0100
-+++ openssl-1.0.0f/crypto/rand/rand_err.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/rand_err.c.fips openssl-1.0.0k/crypto/rand/rand_err.c
+--- openssl-1.0.0k/crypto/rand/rand_err.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/rand_err.c	2013-02-19 20:12:54.599664950 +0100
 @@ -70,6 +70,13 @@
  
  static ERR_STRING_DATA RAND_str_functs[]=
@@ -10728,9 +10725,9 @@ diff -up openssl-1.0.0f/crypto/rand/rand_err.c.fips openssl-1.0.0f/crypto/rand/r
  {0,NULL}
  	};
  
-diff -up openssl-1.0.0f/crypto/rand/rand.h.fips openssl-1.0.0f/crypto/rand/rand.h
---- openssl-1.0.0f/crypto/rand/rand.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rand/rand.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/rand.h.fips openssl-1.0.0k/crypto/rand/rand.h
+--- openssl-1.0.0k/crypto/rand/rand.h.fips	2013-02-19 20:12:54.071654932 +0100
++++ openssl-1.0.0k/crypto/rand/rand.h	2013-02-19 20:12:54.599664950 +0100
 @@ -128,11 +128,28 @@ void ERR_load_RAND_strings(void);
  /* Error codes for the RAND functions. */
  
@@ -10760,9 +10757,9 @@ diff -up openssl-1.0.0f/crypto/rand/rand.h.fips openssl-1.0.0f/crypto/rand/rand.
  
  #ifdef  __cplusplus
  }
-diff -up openssl-1.0.0f/crypto/rand/rand_lib.c.fips openssl-1.0.0f/crypto/rand/rand_lib.c
---- openssl-1.0.0f/crypto/rand/rand_lib.c.fips	2008-11-12 04:58:04.000000000 +0100
-+++ openssl-1.0.0f/crypto/rand/rand_lib.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/rand_lib.c.fips openssl-1.0.0k/crypto/rand/rand_lib.c
+--- openssl-1.0.0k/crypto/rand/rand_lib.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/rand_lib.c	2013-02-19 20:12:54.599664950 +0100
 @@ -60,6 +60,12 @@
  #include <time.h>
  #include "cryptlib.h"
@@ -10796,9 +10793,9 @@ diff -up openssl-1.0.0f/crypto/rand/rand_lib.c.fips openssl-1.0.0f/crypto/rand/r
  	return default_RAND_meth;
  	}
  
-diff -up openssl-1.0.0f/crypto/rc2/rc2.h.fips openssl-1.0.0f/crypto/rc2/rc2.h
---- openssl-1.0.0f/crypto/rc2/rc2.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc2/rc2.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc2/rc2.h.fips openssl-1.0.0k/crypto/rc2/rc2.h
+--- openssl-1.0.0k/crypto/rc2/rc2.h.fips	2013-02-19 20:12:54.216657683 +0100
++++ openssl-1.0.0k/crypto/rc2/rc2.h	2013-02-19 20:12:54.599664950 +0100
 @@ -79,7 +79,9 @@ typedef struct rc2_key_st
  	RC2_INT data[64];
  	} RC2_KEY;
@@ -10810,9 +10807,9 @@ diff -up openssl-1.0.0f/crypto/rc2/rc2.h.fips openssl-1.0.0f/crypto/rc2/rc2.h
  void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
  void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
  		     int enc);
-diff -up openssl-1.0.0f/crypto/rc2/rc2_skey.c.fips openssl-1.0.0f/crypto/rc2/rc2_skey.c
---- openssl-1.0.0f/crypto/rc2/rc2_skey.c.fips	2007-09-18 23:10:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/rc2/rc2_skey.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc2/rc2_skey.c.fips openssl-1.0.0k/crypto/rc2/rc2_skey.c
+--- openssl-1.0.0k/crypto/rc2/rc2_skey.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc2/rc2_skey.c	2013-02-19 20:12:54.600664970 +0100
 @@ -57,6 +57,11 @@
   */
  
@@ -10846,31 +10843,9 @@ diff -up openssl-1.0.0f/crypto/rc2/rc2_skey.c.fips openssl-1.0.0f/crypto/rc2/rc2
  	int i,j;
  	unsigned char *k;
  	RC2_INT *ki;
-diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl.fips openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl
---- openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl.fips	2009-02-12 15:48:49.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl	2012-01-05 13:22:30.000000000 +0100
-@@ -202,4 +202,6 @@ RC4_options:
- .string	"rc4(8x,char)"
- ___
- 
-+$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
-+
- print $code;
-diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl.fips openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl
---- openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl.fips	2009-04-27 21:31:04.000000000 +0200
-+++ openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl	2012-01-05 13:22:30.000000000 +0100
-@@ -499,6 +499,8 @@ ___
- 
- $code =~ s/#([bwd])/$1/gm;
- 
-+$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
-+
- print $code;
- 
- close STDOUT;
-diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl.fips openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl
---- openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl.fips	2007-12-02 22:32:03.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.fips openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl	2013-02-19 20:12:54.600664970 +0100
 @@ -166,8 +166,12 @@ $idx="edx";
  
  &external_label("OPENSSL_ia32cap_P");
@@ -10894,9 +10869,31 @@ diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl.fips openssl-1.0.0f/crypto/rc4
  
  # const char *RC4_options(void);
  &function_begin_B("RC4_options");
-diff -up openssl-1.0.0f/crypto/rc4/Makefile.fips openssl-1.0.0f/crypto/rc4/Makefile
---- openssl-1.0.0f/crypto/rc4/Makefile.fips	2009-02-11 11:01:36.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/Makefile	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl.fips openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl	2013-02-19 20:12:54.600664970 +0100
+@@ -202,4 +202,6 @@ RC4_options:
+ .string	"rc4(8x,char)"
+ ___
+ 
++$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
++
+ print $code;
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.fips openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl	2013-02-19 20:12:54.600664970 +0100
+@@ -499,6 +499,8 @@ ___
+ 
+ $code =~ s/#([bwd])/$1/gm;
+ 
++$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
++
+ print $code;
+ 
+ close STDOUT;
+diff -up openssl-1.0.0k/crypto/rc4/Makefile.fips openssl-1.0.0k/crypto/rc4/Makefile
+--- openssl-1.0.0k/crypto/rc4/Makefile.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/Makefile	2013-02-19 20:12:54.600664970 +0100
 @@ -21,8 +21,8 @@ TEST=rc4test.c
  APPS=
  
@@ -10908,9 +10905,9 @@ diff -up openssl-1.0.0f/crypto/rc4/Makefile.fips openssl-1.0.0f/crypto/rc4/Makef
  
  SRC= $(LIBSRC)
  
-diff -up openssl-1.0.0f/crypto/rc4/rc4_fblk.c.fips openssl-1.0.0f/crypto/rc4/rc4_fblk.c
---- openssl-1.0.0f/crypto/rc4/rc4_fblk.c.fips	2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/rc4_fblk.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/rc4_fblk.c.fips openssl-1.0.0k/crypto/rc4/rc4_fblk.c
+--- openssl-1.0.0k/crypto/rc4/rc4_fblk.c.fips	2013-02-19 20:12:54.601664990 +0100
++++ openssl-1.0.0k/crypto/rc4/rc4_fblk.c	2013-02-19 20:12:54.601664990 +0100
 @@ -0,0 +1,75 @@
 +/* crypto/rc4/rc4_fblk.c */
 +/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -10987,9 +10984,9 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4_fblk.c.fips openssl-1.0.0f/crypto/rc4/rc4
 +	}
 +#endif
 +
-diff -up openssl-1.0.0f/crypto/rc4/rc4.h.fips openssl-1.0.0f/crypto/rc4/rc4.h
---- openssl-1.0.0f/crypto/rc4/rc4.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/rc4.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/rc4.h.fips openssl-1.0.0k/crypto/rc4/rc4.h
+--- openssl-1.0.0k/crypto/rc4/rc4.h.fips	2013-02-19 20:12:53.860650927 +0100
++++ openssl-1.0.0k/crypto/rc4/rc4.h	2013-02-19 20:12:54.601664990 +0100
 @@ -78,6 +78,9 @@ typedef struct rc4_key_st
  
   
@@ -11000,9 +10997,9 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4.h.fips openssl-1.0.0f/crypto/rc4/rc4.h
  void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
  void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
  		unsigned char *outdata);
-diff -up openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips openssl-1.0.0f/crypto/rc4/rc4_skey.c
---- openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips	2007-01-21 14:07:13.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/rc4_skey.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/rc4_skey.c.fips openssl-1.0.0k/crypto/rc4/rc4_skey.c
+--- openssl-1.0.0k/crypto/rc4/rc4_skey.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/rc4_skey.c	2013-02-19 20:12:54.601664990 +0100
 @@ -59,6 +59,11 @@
  #include <openssl/rc4.h>
  #include "rc4_locl.h"
@@ -11027,7 +11024,7 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips openssl-1.0.0f/crypto/rc4/rc4
  	{
          register RC4_INT tmp;
          register int id1,id2;
-@@ -126,7 +135,12 @@ void RC4_set_key(RC4_KEY *key, int len, 
+@@ -126,7 +135,12 @@ void RC4_set_key(RC4_KEY *key, int len,
  		 * module...
  		 *				<appro at fy.chalmers.se>
  		 */
@@ -11040,9 +11037,9 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips openssl-1.0.0f/crypto/rc4/rc4
  			unsigned char *cp=(unsigned char *)d;
  
  			for (i=0;i<256;i++) cp[i]=i;
-diff -up openssl-1.0.0f/crypto/ripemd/ripemd.h.fips openssl-1.0.0f/crypto/ripemd/ripemd.h
---- openssl-1.0.0f/crypto/ripemd/ripemd.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/ripemd/ripemd.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/ripemd/ripemd.h.fips openssl-1.0.0k/crypto/ripemd/ripemd.h
+--- openssl-1.0.0k/crypto/ripemd/ripemd.h.fips	2013-02-19 20:12:54.170656810 +0100
++++ openssl-1.0.0k/crypto/ripemd/ripemd.h	2013-02-19 20:12:54.601664990 +0100
 @@ -91,6 +91,9 @@ typedef struct RIPEMD160state_st
  	unsigned int   num;
  	} RIPEMD160_CTX;
@@ -11053,9 +11050,9 @@ diff -up openssl-1.0.0f/crypto/ripemd/ripemd.h.fips openssl-1.0.0f/crypto/ripemd
  int RIPEMD160_Init(RIPEMD160_CTX *c);
  int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
  int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);
-diff -up openssl-1.0.0f/crypto/ripemd/rmd_dgst.c.fips openssl-1.0.0f/crypto/ripemd/rmd_dgst.c
---- openssl-1.0.0f/crypto/ripemd/rmd_dgst.c.fips	2007-01-21 14:07:13.000000000 +0100
-+++ openssl-1.0.0f/crypto/ripemd/rmd_dgst.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/ripemd/rmd_dgst.c.fips openssl-1.0.0k/crypto/ripemd/rmd_dgst.c
+--- openssl-1.0.0k/crypto/ripemd/rmd_dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/ripemd/rmd_dgst.c	2013-02-19 20:12:54.601664990 +0100
 @@ -59,6 +59,11 @@
  #include <stdio.h>
  #include "rmd_locl.h"
@@ -11077,9 +11074,9 @@ diff -up openssl-1.0.0f/crypto/ripemd/rmd_dgst.c.fips openssl-1.0.0f/crypto/ripe
  	{
  	memset (c,0,sizeof(*c));
  	c->A=RIPEMD160_A;
-diff -up openssl-1.0.0f/crypto/rsa/rsa_eay.c.fips openssl-1.0.0f/crypto/rsa/rsa_eay.c
---- openssl-1.0.0f/crypto/rsa/rsa_eay.c.fips	2011-10-19 16:58:34.000000000 +0200
-+++ openssl-1.0.0f/crypto/rsa/rsa_eay.c	2012-01-05 13:27:00.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_eay.c.fips openssl-1.0.0k/crypto/rsa/rsa_eay.c
+--- openssl-1.0.0k/crypto/rsa/rsa_eay.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_eay.c	2013-02-19 20:12:54.601664990 +0100
 @@ -114,6 +114,10 @@
  #include <openssl/bn.h>
  #include <openssl/rsa.h>
@@ -11340,9 +11337,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_eay.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  	rsa->flags|=RSA_FLAG_CACHE_PUBLIC|RSA_FLAG_CACHE_PRIVATE;
  	return(1);
  	}
-diff -up openssl-1.0.0f/crypto/rsa/rsa_err.c.fips openssl-1.0.0f/crypto/rsa/rsa_err.c
---- openssl-1.0.0f/crypto/rsa/rsa_err.c.fips	2008-12-29 17:11:56.000000000 +0100
-+++ openssl-1.0.0f/crypto/rsa/rsa_err.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_err.c.fips openssl-1.0.0k/crypto/rsa/rsa_err.c
+--- openssl-1.0.0k/crypto/rsa/rsa_err.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_err.c	2013-02-19 20:12:54.602665009 +0100
 @@ -111,8 +111,12 @@ static ERR_STRING_DATA RSA_str_functs[]=
  {ERR_FUNC(RSA_F_RSA_PRINT_FP),	"RSA_print_fp"},
  {ERR_FUNC(RSA_F_RSA_PRIV_DECODE),	"RSA_PRIV_DECODE"},
@@ -11369,9 +11366,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_err.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  {ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"},
  {ERR_REASON(RSA_R_PADDING_CHECK_FAILED)  ,"padding check failed"},
  {ERR_REASON(RSA_R_P_NOT_PRIME)           ,"p not prime"},
-diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_gen.c
---- openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips	2007-03-28 02:15:27.000000000 +0200
-+++ openssl-1.0.0f/crypto/rsa/rsa_gen.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_gen.c.fips openssl-1.0.0k/crypto/rsa/rsa_gen.c
+--- openssl-1.0.0k/crypto/rsa/rsa_gen.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_gen.c	2013-02-19 20:12:54.602665009 +0100
 @@ -67,6 +67,82 @@
  #include "cryptlib.h"
  #include <openssl/bn.h>
@@ -11455,7 +11452,7 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  
  static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb);
  
-@@ -90,6 +166,23 @@ static int rsa_builtin_keygen(RSA *rsa, 
+@@ -90,6 +166,23 @@ static int rsa_builtin_keygen(RSA *rsa,
  	int bitsp,bitsq,ok= -1,n=0;
  	BN_CTX *ctx=NULL;
  
@@ -11479,7 +11476,7 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  	ctx=BN_CTX_new();
  	if (ctx == NULL) goto err;
  	BN_CTX_start(ctx);
-@@ -201,6 +294,17 @@ static int rsa_builtin_keygen(RSA *rsa, 
+@@ -201,6 +294,17 @@ static int rsa_builtin_keygen(RSA *rsa,
  		p = rsa->p;
  	if (!BN_mod_inverse(rsa->iqmp,rsa->q,p,ctx)) goto err;
  
@@ -11497,9 +11494,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  	ok=1;
  err:
  	if (ok == -1)
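
The rsa_gen.c changes above are FIPS-mode additions to RSA key generation: a larger block near the top of the file, a block at the start of rsa_builtin_keygen() before the BN_CTX is set up, and a block inserted after the CRT parameters (iqmp) are computed. Their bodies fall outside the visible context; a FIPS-style key generation typically enforces a minimum modulus size up front and finishes with a pairwise consistency test on the freshly generated key. A minimal sketch of such a test, under those assumptions (helper name and buffer sizes are illustrative, not the patch's code):

    /* Hypothetical pairwise consistency check run after RSA key generation;
     * not the literal code added by openssl-1.0.0k-fips.patch. */
    #include <string.h>
    #include <openssl/rsa.h>

    static int rsa_pairwise_consistency_test(RSA *key)
    {
            static const unsigned char tbs[] = "FIPS pairwise consistency";
            unsigned char ctext[1024], ptext[1024];
            int clen, plen;

            if (RSA_size(key) > (int)sizeof(ctext))
                    return 0;
            clen = RSA_public_encrypt(sizeof(tbs), tbs, ctext, key,
                                      RSA_PKCS1_PADDING);
            if (clen <= 0)
                    return 0;
            plen = RSA_private_decrypt(clen, ctext, ptext, key,
                                       RSA_PKCS1_PADDING);
            return plen == (int)sizeof(tbs) && memcmp(tbs, ptext, plen) == 0;
    }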
-diff -up openssl-1.0.0f/crypto/rsa/rsa.h.fips openssl-1.0.0f/crypto/rsa/rsa.h
---- openssl-1.0.0f/crypto/rsa/rsa.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rsa/rsa.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa.h.fips openssl-1.0.0k/crypto/rsa/rsa.h
+--- openssl-1.0.0k/crypto/rsa/rsa.h.fips	2013-02-19 20:12:54.354660303 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa.h	2013-02-19 20:12:54.602665009 +0100
 @@ -74,6 +74,21 @@
  #error RSA is disabled.
  #endif
@@ -11569,9 +11566,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa.h.fips openssl-1.0.0f/crypto/rsa/rsa.h
  #define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE	 148
  #define RSA_R_PADDING_CHECK_FAILED			 114
  #define RSA_R_P_NOT_PRIME				 128
-diff -up openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips openssl-1.0.0f/crypto/rsa/rsa_lib.c
---- openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips	2009-12-09 14:38:20.000000000 +0100
-+++ openssl-1.0.0f/crypto/rsa/rsa_lib.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_lib.c.fips openssl-1.0.0k/crypto/rsa/rsa_lib.c
+--- openssl-1.0.0k/crypto/rsa/rsa_lib.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_lib.c	2013-02-19 20:12:54.602665009 +0100
 @@ -80,6 +80,13 @@ RSA *RSA_new(void)
  
  void RSA_set_default_method(const RSA_METHOD *meth)
@@ -11633,7 +11630,7 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  	return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
  	}
  
-@@ -306,6 +339,13 @@ int RSA_private_decrypt(int flen, const 
+@@ -306,6 +339,13 @@ int RSA_private_decrypt(int flen, const
  int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
  	     RSA *rsa, int padding)
  	{
@@ -11647,9 +11644,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips openssl-1.0.0f/crypto/rsa/rsa_
  	return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
  	}
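
The rsa.h and rsa_lib.c changes above add FIPS-mode guards in front of the public RSA operations before they dispatch to rsa->meth; the inserted lines themselves are outside the visible context. Presumably they refuse to operate on keys that do not meet the FIPS minimum modulus size unless the caller has explicitly opted out. A sketch of that kind of guard, with the flag name and the 1024-bit threshold as assumptions rather than quotes from the patch:

    #include <openssl/bn.h>
    #include <openssl/crypto.h>
    #include <openssl/rsa.h>

    /* Assumed shape of the check placed at the top of RSA_public_decrypt()
     * and friends; names and threshold are illustrative only. */
    static int rsa_key_allowed_in_fips_mode(const RSA *rsa)
    {
            if (!FIPS_mode())
                    return 1;
            if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)   /* explicit opt-out */
                    return 1;
            return BN_num_bits(rsa->n) >= 1024;         /* minimum modulus size */
    }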
  
-diff -up openssl-1.0.0f/crypto/rsa/rsa_sign.c.fips openssl-1.0.0f/crypto/rsa/rsa_sign.c
---- openssl-1.0.0f/crypto/rsa/rsa_sign.c.fips	2007-04-24 03:05:42.000000000 +0200
-+++ openssl-1.0.0f/crypto/rsa/rsa_sign.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_sign.c.fips openssl-1.0.0k/crypto/rsa/rsa_sign.c
+--- openssl-1.0.0k/crypto/rsa/rsa_sign.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_sign.c	2013-02-19 20:12:54.603665028 +0100
 @@ -130,7 +130,8 @@ int RSA_sign(int type, const unsigned ch
  		i2d_X509_SIG(&sig,&p);
  		s=tmps;
@@ -11681,9 +11678,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_sign.c.fips openssl-1.0.0f/crypto/rsa/rsa
  
  	if (i <= 0) goto err;
  
-diff -up openssl-1.0.0f/crypto/seed/seed.c.fips openssl-1.0.0f/crypto/seed/seed.c
---- openssl-1.0.0f/crypto/seed/seed.c.fips	2008-12-16 08:41:21.000000000 +0100
-+++ openssl-1.0.0f/crypto/seed/seed.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/seed/seed.c.fips openssl-1.0.0k/crypto/seed/seed.c
+--- openssl-1.0.0k/crypto/seed/seed.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/seed/seed.c	2013-02-19 20:12:54.603665028 +0100
 @@ -34,6 +34,9 @@
  
  #include <openssl/seed.h>
@@ -11699,7 +11696,7 @@ diff -up openssl-1.0.0f/crypto/seed/seed.c.fips openssl-1.0.0f/crypto/seed/seed.
  #endif
  
 +#ifdef OPENSSL_FIPS
- void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
++void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
 +        {
 +        if (FIPS_mode())
 +                FIPS_BAD_ABORT(SEED)
@@ -11708,14 +11705,14 @@ diff -up openssl-1.0.0f/crypto/seed/seed.c.fips openssl-1.0.0f/crypto/seed/seed.
 +
 +void private_SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
 +#else
-+void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
+ void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
 +#endif
  {
  	seed_word x1, x2, x3, x4;
  	seed_word t0, t1;
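
The seed.c hunk above shows the pattern this patch applies to every non-approved algorithm: the public entry point (here SEED_set_key()) becomes a wrapper that hits FIPS_BAD_ABORT() when FIPS mode is active, and the original body is renamed to a private_ variant that internal callers can still reach. From an application's point of view the effect is simply that non-approved ciphers stop working once FIPS mode is on; an illustrative caller (not part of the patch):

    #include <openssl/crypto.h>
    #include <openssl/seed.h>

    static void seed_schedule_demo(void)
    {
            unsigned char rawkey[SEED_KEY_LENGTH] = { 0 };
            SEED_KEY_SCHEDULE ks;

            /* SEED is not FIPS-approved: with this patch, calling SEED_set_key()
             * while FIPS_mode() is non-zero aborts, so check first. */
            if (!FIPS_mode())
                    SEED_set_key(rawkey, &ks);
    }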
-diff -up openssl-1.0.0f/crypto/seed/seed.h.fips openssl-1.0.0f/crypto/seed/seed.h
---- openssl-1.0.0f/crypto/seed/seed.h.fips	2012-01-05 13:22:28.000000000 +0100
-+++ openssl-1.0.0f/crypto/seed/seed.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/seed/seed.h.fips openssl-1.0.0k/crypto/seed/seed.h
+--- openssl-1.0.0k/crypto/seed/seed.h.fips	2013-02-19 20:12:54.022654004 +0100
++++ openssl-1.0.0k/crypto/seed/seed.h	2013-02-19 20:12:54.603665028 +0100
 @@ -117,6 +117,9 @@ typedef struct seed_key_st {
  } SEED_KEY_SCHEDULE;
  
@@ -11726,57 +11723,9 @@ diff -up openssl-1.0.0f/crypto/seed/seed.h.fips openssl-1.0.0f/crypto/seed/seed.
  void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks);
  
  void SEED_encrypt(const unsigned char s[SEED_BLOCK_SIZE], unsigned char d[SEED_BLOCK_SIZE], const SEED_KEY_SCHEDULE *ks);
-diff -up openssl-1.0.0f/crypto/sha/sha_dgst.c.fips openssl-1.0.0f/crypto/sha/sha_dgst.c
---- openssl-1.0.0f/crypto/sha/sha_dgst.c.fips	2007-01-21 14:07:14.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha_dgst.c	2012-01-05 13:22:30.000000000 +0100
-@@ -57,6 +57,12 @@
-  */
- 
- #include <openssl/opensslconf.h>
-+#include <openssl/crypto.h>
-+#ifdef OPENSSL_FIPS
-+#include <openssl/fips.h>
-+#endif
-+
-+#include <openssl/err.h>
- #if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
- 
- #undef  SHA_1
-diff -up openssl-1.0.0f/crypto/sha/sha.h.fips openssl-1.0.0f/crypto/sha/sha.h
---- openssl-1.0.0f/crypto/sha/sha.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha.h	2012-01-05 13:22:30.000000000 +0100
-@@ -106,6 +106,9 @@ typedef struct SHAstate_st
- 	} SHA_CTX;
- 
- #ifndef OPENSSL_NO_SHA0
-+#ifdef OPENSSL_FIPS
-+int private_SHA_Init(SHA_CTX *c);
-+#endif
- int SHA_Init(SHA_CTX *c);
- int SHA_Update(SHA_CTX *c, const void *data, size_t len);
- int SHA_Final(unsigned char *md, SHA_CTX *c);
-diff -up openssl-1.0.0f/crypto/sha/sha_locl.h.fips openssl-1.0.0f/crypto/sha/sha_locl.h
---- openssl-1.0.0f/crypto/sha/sha_locl.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha_locl.h	2012-01-05 13:22:30.000000000 +0100
-@@ -122,8 +122,15 @@ void sha1_block_data_order (SHA_CTX *c, 
- #define INIT_DATA_h3 0x10325476UL
- #define INIT_DATA_h4 0xc3d2e1f0UL
- 
-+#if defined(SHA_0) && defined(OPENSSL_FIPS)
-+FIPS_NON_FIPS_MD_Init(SHA)
-+#else
- int HASH_INIT (SHA_CTX *c)
-+#endif
- 	{
-+#if defined(SHA_1) && defined(OPENSSL_FIPS)
-+	FIPS_selftest_check();
-+#endif
- 	memset (c,0,sizeof(*c));
- 	c->h0=INIT_DATA_h0;
- 	c->h1=INIT_DATA_h1;
-diff -up openssl-1.0.0f/crypto/sha/sha1dgst.c.fips openssl-1.0.0f/crypto/sha/sha1dgst.c
---- openssl-1.0.0f/crypto/sha/sha1dgst.c.fips	2007-01-21 14:07:14.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha1dgst.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha1dgst.c.fips openssl-1.0.0k/crypto/sha/sha1dgst.c
+--- openssl-1.0.0k/crypto/sha/sha1dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha1dgst.c	2013-02-19 20:12:54.604665047 +0100
 @@ -63,6 +63,10 @@
  #define SHA_1
  
@@ -11788,9 +11737,9 @@ diff -up openssl-1.0.0f/crypto/sha/sha1dgst.c.fips openssl-1.0.0f/crypto/sha/sha
  
  const char SHA1_version[]="SHA1" OPENSSL_VERSION_PTEXT;
  
-diff -up openssl-1.0.0f/crypto/sha/sha256.c.fips openssl-1.0.0f/crypto/sha/sha256.c
---- openssl-1.0.0f/crypto/sha/sha256.c.fips	2007-01-21 14:07:14.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha256.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha256.c.fips openssl-1.0.0k/crypto/sha/sha256.c
+--- openssl-1.0.0k/crypto/sha/sha256.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha256.c	2013-02-19 20:12:54.604665047 +0100
 @@ -12,12 +12,19 @@
  
  #include <openssl/crypto.h>
@@ -11821,9 +11770,9 @@ diff -up openssl-1.0.0f/crypto/sha/sha256.c.fips openssl-1.0.0f/crypto/sha/sha25
  	memset (c,0,sizeof(*c));
  	c->h[0]=0x6a09e667UL;	c->h[1]=0xbb67ae85UL;
  	c->h[2]=0x3c6ef372UL;	c->h[3]=0xa54ff53aUL;
-diff -up openssl-1.0.0f/crypto/sha/sha512.c.fips openssl-1.0.0f/crypto/sha/sha512.c
---- openssl-1.0.0f/crypto/sha/sha512.c.fips	2009-12-30 12:53:33.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha512.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha512.c.fips openssl-1.0.0k/crypto/sha/sha512.c
+--- openssl-1.0.0k/crypto/sha/sha512.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha512.c	2013-02-19 20:12:54.604665047 +0100
 @@ -5,6 +5,10 @@
   * ====================================================================
   */
@@ -11855,9 +11804,57 @@ diff -up openssl-1.0.0f/crypto/sha/sha512.c.fips openssl-1.0.0f/crypto/sha/sha51
  #if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
  	/* maintain dword order required by assembler module */
  	unsigned int *h = (unsigned int *)c->h;
-diff -up openssl-1.0.0f/crypto/whrlpool/whrlpool.h.fips openssl-1.0.0f/crypto/whrlpool/whrlpool.h
---- openssl-1.0.0f/crypto/whrlpool/whrlpool.h.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/whrlpool/whrlpool.h	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha_dgst.c.fips openssl-1.0.0k/crypto/sha/sha_dgst.c
+--- openssl-1.0.0k/crypto/sha/sha_dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha_dgst.c	2013-02-19 20:12:54.603665028 +0100
+@@ -57,6 +57,12 @@
+  */
+ 
+ #include <openssl/opensslconf.h>
++#include <openssl/crypto.h>
++#ifdef OPENSSL_FIPS
++#include <openssl/fips.h>
++#endif
++
++#include <openssl/err.h>
+ #if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
+ 
+ #undef  SHA_1
+diff -up openssl-1.0.0k/crypto/sha/sha.h.fips openssl-1.0.0k/crypto/sha/sha.h
+--- openssl-1.0.0k/crypto/sha/sha.h.fips	2013-02-19 20:12:53.892651535 +0100
++++ openssl-1.0.0k/crypto/sha/sha.h	2013-02-19 20:12:54.603665028 +0100
+@@ -106,6 +106,9 @@ typedef struct SHAstate_st
+ 	} SHA_CTX;
+ 
+ #ifndef OPENSSL_NO_SHA0
++#ifdef OPENSSL_FIPS
++int private_SHA_Init(SHA_CTX *c);
++#endif
+ int SHA_Init(SHA_CTX *c);
+ int SHA_Update(SHA_CTX *c, const void *data, size_t len);
+ int SHA_Final(unsigned char *md, SHA_CTX *c);
+diff -up openssl-1.0.0k/crypto/sha/sha_locl.h.fips openssl-1.0.0k/crypto/sha/sha_locl.h
+--- openssl-1.0.0k/crypto/sha/sha_locl.h.fips	2013-02-19 20:12:53.897651631 +0100
++++ openssl-1.0.0k/crypto/sha/sha_locl.h	2013-02-19 20:12:54.603665028 +0100
+@@ -122,8 +122,15 @@ void sha1_block_data_order (SHA_CTX *c,
+ #define INIT_DATA_h3 0x10325476UL
+ #define INIT_DATA_h4 0xc3d2e1f0UL
+ 
++#if defined(SHA_0) && defined(OPENSSL_FIPS)
++FIPS_NON_FIPS_MD_Init(SHA)
++#else
+ int HASH_INIT (SHA_CTX *c)
++#endif
+ 	{
++#if defined(SHA_1) && defined(OPENSSL_FIPS)
++	FIPS_selftest_check();
++#endif
+ 	memset (c,0,sizeof(*c));
+ 	c->h0=INIT_DATA_h0;
+ 	c->h1=INIT_DATA_h1;
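
The sha.h/sha_locl.h hunks split the two algorithms that share this init code: when SHA_0 is being compiled, the public init is generated by the patch's FIPS_NON_FIPS_MD_Init(SHA) macro (SHA-0 is not an approved digest), while the SHA-1 init gains a FIPS_selftest_check() call so the digest cannot be used before the power-on self tests have passed. The macro's expansion is not shown here; it presumably produces a wrapper along these lines (a sketch, not the real macro body):

    /* Presumed expansion of FIPS_NON_FIPS_MD_Init(SHA); details may differ. */
    #ifdef OPENSSL_FIPS
    int SHA_Init(SHA_CTX *c)
            {
            if (FIPS_mode())
                    FIPS_BAD_ABORT(SHA)     /* SHA-0 must not run in FIPS mode */
            return private_SHA_Init(c);
            }
    #endif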
+diff -up openssl-1.0.0k/crypto/whrlpool/whrlpool.h.fips openssl-1.0.0k/crypto/whrlpool/whrlpool.h
+--- openssl-1.0.0k/crypto/whrlpool/whrlpool.h.fips	2013-02-19 20:12:54.187657134 +0100
++++ openssl-1.0.0k/crypto/whrlpool/whrlpool.h	2013-02-19 20:12:54.604665047 +0100
 @@ -24,6 +24,9 @@ typedef struct	{
  	} WHIRLPOOL_CTX;
  
@@ -11868,9 +11865,9 @@ diff -up openssl-1.0.0f/crypto/whrlpool/whrlpool.h.fips openssl-1.0.0f/crypto/wh
  int WHIRLPOOL_Init	(WHIRLPOOL_CTX *c);
  int WHIRLPOOL_Update	(WHIRLPOOL_CTX *c,const void *inp,size_t bytes);
  void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits);
-diff -up openssl-1.0.0f/crypto/whrlpool/wp_dgst.c.fips openssl-1.0.0f/crypto/whrlpool/wp_dgst.c
---- openssl-1.0.0f/crypto/whrlpool/wp_dgst.c.fips	2008-12-29 13:35:49.000000000 +0100
-+++ openssl-1.0.0f/crypto/whrlpool/wp_dgst.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/whrlpool/wp_dgst.c.fips openssl-1.0.0k/crypto/whrlpool/wp_dgst.c
+--- openssl-1.0.0k/crypto/whrlpool/wp_dgst.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/whrlpool/wp_dgst.c	2013-02-19 20:12:54.604665047 +0100
 @@ -53,8 +53,12 @@
  
  #include "wp_locl.h"
@@ -11885,9 +11882,9 @@ diff -up openssl-1.0.0f/crypto/whrlpool/wp_dgst.c.fips openssl-1.0.0f/crypto/whr
  	{
  	memset (c,0,sizeof(*c));
  	return(1);
-diff -up openssl-1.0.0f/Makefile.org.fips openssl-1.0.0f/Makefile.org
---- openssl-1.0.0f/Makefile.org.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/Makefile.org	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/Makefile.org.fips openssl-1.0.0k/Makefile.org
+--- openssl-1.0.0k/Makefile.org.fips	2013-02-19 20:12:54.544663908 +0100
++++ openssl-1.0.0k/Makefile.org	2013-02-19 20:12:54.604665047 +0100
 @@ -110,6 +110,9 @@ LIBKRB5=
  ZLIB_INCLUDE=
  LIBZLIB=
@@ -11915,9 +11912,124 @@ diff -up openssl-1.0.0f/Makefile.org.fips openssl-1.0.0f/Makefile.org
  		THIS=$${THIS:-$@} MAKEFILE=Makefile MAKEOVERRIDES=
  # MAKEOVERRIDES= effectively "equalizes" GNU-ish and SysV-ish make flavors,
  # which in turn eliminates ambiguities in variable treatment with -e.
-diff -up openssl-1.0.0f/ssl/ssl_ciph.c.fips openssl-1.0.0f/ssl/ssl_ciph.c
---- openssl-1.0.0f/ssl/ssl_ciph.c.fips	2011-12-02 13:51:05.000000000 +0100
-+++ openssl-1.0.0f/ssl/ssl_ciph.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/ssl/s23_clnt.c.fips openssl-1.0.0k/ssl/s23_clnt.c
+--- openssl-1.0.0k/ssl/s23_clnt.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s23_clnt.c	2013-02-19 20:12:54.607665104 +0100
+@@ -334,6 +334,14 @@ static int ssl23_client_hello(SSL *s)
+ 			version_major = TLS1_VERSION_MAJOR;
+ 			version_minor = TLS1_VERSION_MINOR;
+ 			}
++#ifdef OPENSSL_FIPS
++		else if(FIPS_mode())
++			{
++			SSLerr(SSL_F_SSL23_CLIENT_HELLO,
++					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
++			return -1;
++			}
++#endif
+ 		else if (version == SSL3_VERSION)
+ 			{
+ 			version_major = SSL3_VERSION_MAJOR;
+@@ -617,6 +625,14 @@ static int ssl23_get_server_hello(SSL *s
+ 		if ((p[2] == SSL3_VERSION_MINOR) &&
+ 			!(s->options & SSL_OP_NO_SSLv3))
+ 			{
++#ifdef OPENSSL_FIPS
++			if(FIPS_mode())
++				{
++				SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,
++					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
++				goto err;
++				}
++#endif
+ 			s->version=SSL3_VERSION;
+ 			s->method=SSLv3_client_method();
+ 			}
+diff -up openssl-1.0.0k/ssl/s23_srvr.c.fips openssl-1.0.0k/ssl/s23_srvr.c
+--- openssl-1.0.0k/ssl/s23_srvr.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s23_srvr.c	2013-02-19 20:12:54.607665104 +0100
+@@ -393,6 +393,15 @@ int ssl23_get_client_hello(SSL *s)
+ 			}
+ 		}
+ 
++#ifdef OPENSSL_FIPS
++	if (FIPS_mode() && (s->version < TLS1_VERSION))
++		{
++		SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO,
++					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
++		goto err;
++		}
++#endif
++
+ 	if (s->state == SSL23_ST_SR_CLNT_HELLO_B)
+ 		{
+ 		/* we have SSLv3/TLSv1 in an SSLv2 header
+diff -up openssl-1.0.0k/ssl/s3_clnt.c.fips openssl-1.0.0k/ssl/s3_clnt.c
+--- openssl-1.0.0k/ssl/s3_clnt.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s3_clnt.c	2013-02-19 20:12:54.608665123 +0100
+@@ -156,6 +156,10 @@
+ #include <openssl/objects.h>
+ #include <openssl/evp.h>
+ #include <openssl/md5.h>
++#ifdef OPENSSL_FIPS
++#include <openssl/fips.h>
++#endif
++
+ #ifndef OPENSSL_NO_DH
+ #include <openssl/dh.h>
+ #endif
+@@ -1559,6 +1563,8 @@ int ssl3_get_key_exchange(SSL *s)
+ 			q=md_buf;
+ 			for (num=2; num > 0; num--)
+ 				{
++				EVP_MD_CTX_set_flags(&md_ctx,
++					EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ 				EVP_DigestInit_ex(&md_ctx,(num == 2)
+ 					?s->ctx->md5:s->ctx->sha1, NULL);
+ 				EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
+diff -up openssl-1.0.0k/ssl/s3_enc.c.fips openssl-1.0.0k/ssl/s3_enc.c
+--- openssl-1.0.0k/ssl/s3_enc.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s3_enc.c	2013-02-19 20:12:54.609665142 +0100
+@@ -170,6 +170,7 @@ static int ssl3_generate_key_block(SSL *
+ #endif
+ 	k=0;
+ 	EVP_MD_CTX_init(&m5);
++	EVP_MD_CTX_set_flags(&m5, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ 	EVP_MD_CTX_init(&s1);
+ 	for (i=0; (int)i<num; i+=MD5_DIGEST_LENGTH)
+ 		{
+@@ -609,6 +610,8 @@ int ssl3_digest_cached_records(SSL *s)
+ 		if ((mask & s->s3->tmp.new_cipher->algorithm2) && md) 
+ 			{
+ 			s->s3->handshake_dgst[i]=EVP_MD_CTX_create();
++			EVP_MD_CTX_set_flags(s->s3->handshake_dgst[i],
++				EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ 			EVP_DigestInit_ex(s->s3->handshake_dgst[i],md,NULL);
+ 			EVP_DigestUpdate(s->s3->handshake_dgst[i],hdata,hdatalen);
+ 			} 
+@@ -665,6 +668,7 @@ static int ssl3_handshake_mac(SSL *s, in
+ 		return 0;
+ 	}	
+ 	EVP_MD_CTX_init(&ctx);
++	EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ 	EVP_MD_CTX_copy_ex(&ctx,d);
+ 	n=EVP_MD_CTX_size(&ctx);
+ 	if (n < 0)
+diff -up openssl-1.0.0k/ssl/s3_srvr.c.fips openssl-1.0.0k/ssl/s3_srvr.c
+--- openssl-1.0.0k/ssl/s3_srvr.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s3_srvr.c	2013-02-19 20:12:54.609665142 +0100
+@@ -1779,6 +1779,8 @@ int ssl3_send_server_key_exchange(SSL *s
+ 				j=0;
+ 				for (num=2; num > 0; num--)
+ 					{
++					EVP_MD_CTX_set_flags(&md_ctx,
++						EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ 					EVP_DigestInit_ex(&md_ctx,(num == 2)
+ 						?s->ctx->md5:s->ctx->sha1, NULL);
+ 					EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
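
SSL 3.0 and TLS 1.0/1.1 hard-wire MD5 into the handshake: the ServerKeyExchange signature covers an MD5 digest alongside a SHA-1 digest, and the key block and handshake MACs are derived from both. A FIPS-restricted EVP would normally refuse EVP_md5(), so the s3_clnt.c, s3_enc.c and s3_srvr.c hunks above tag exactly those protocol-internal digest contexts with EVP_MD_CTX_FLAG_NON_FIPS_ALLOW, leaving MD5 rejected everywhere else. A self-contained illustration of the flag (not taken from the patch; error handling trimmed):

    #include <openssl/evp.h>

    /* Compute an MD5 digest in a context exempted from FIPS enforcement,
     * the way the TLS code above does for its handshake-internal digests. */
    static int md5_non_fips(const unsigned char *msg, size_t len,
                            unsigned char out[EVP_MAX_MD_SIZE],
                            unsigned int *outlen)
    {
            EVP_MD_CTX ctx;
            int ok;

            EVP_MD_CTX_init(&ctx);
            EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
            ok = EVP_DigestInit_ex(&ctx, EVP_md5(), NULL) &&
                 EVP_DigestUpdate(&ctx, msg, len) &&
                 EVP_DigestFinal_ex(&ctx, out, outlen);
            EVP_MD_CTX_cleanup(&ctx);
            return ok;
    }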
+diff -up openssl-1.0.0k/ssl/ssl_ciph.c.fips openssl-1.0.0k/ssl/ssl_ciph.c
+--- openssl-1.0.0k/ssl/ssl_ciph.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/ssl_ciph.c	2013-02-19 20:12:54.605665066 +0100
 @@ -728,6 +728,9 @@ static void ssl_cipher_collect_ciphers(c
  		    !(c->algorithm_auth & disabled_auth) &&
  		    !(c->algorithm_enc & disabled_enc) &&
@@ -11940,10 +12052,10 @@ diff -up openssl-1.0.0f/ssl/ssl_ciph.c.fips openssl-1.0.0f/ssl/ssl_ciph.c
  			{
  			sk_SSL_CIPHER_push(cipherstack, curr->cipher);
  #ifdef CIPHER_DEBUG
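
ssl_cipher_collect_ciphers() decides which cipher suites are even eligible for negotiation. The three lines this patch adds to the condition fall outside the visible context, but given the surrounding FIPS work they presumably extend the existing disabled_* tests with a FIPS filter of the form (!FIPS_mode() || (c->algo_strength & SSL_FIPS)), so that only suites flagged as FIPS-approved reach the cipher stack while FIPS mode is active. SSL_FIPS is the strength bit used in FIPS-enabled OpenSSL trees; the exact name here is an assumption, not something shown in this hunk.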
-diff -up openssl-1.0.0f/ssl/ssl_lib.c.fips openssl-1.0.0f/ssl/ssl_lib.c
---- openssl-1.0.0f/ssl/ssl_lib.c.fips	2011-09-26 19:04:49.000000000 +0200
-+++ openssl-1.0.0f/ssl/ssl_lib.c	2012-01-05 13:22:30.000000000 +0100
-@@ -1524,6 +1524,14 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *m
+diff -up openssl-1.0.0k/ssl/ssl_lib.c.fips openssl-1.0.0k/ssl/ssl_lib.c
+--- openssl-1.0.0k/ssl/ssl_lib.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/ssl_lib.c	2013-02-19 20:12:54.605665066 +0100
+@@ -1526,6 +1526,14 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *m
  		return(NULL);
  		}
  
@@ -11958,9 +12070,9 @@ diff -up openssl-1.0.0f/ssl/ssl_lib.c.fips openssl-1.0.0f/ssl/ssl_lib.c
  	if (SSL_get_ex_data_X509_STORE_CTX_idx() < 0)
  		{
  		SSLerr(SSL_F_SSL_CTX_NEW,SSL_R_X509_VERIFICATION_SETUP_PROBLEMS);
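
Taken together with the ssl23_client_hello()/ssl23_get_server_hello()/ssl23_get_client_hello() checks above, the SSL_CTX_new() addition enforces a single policy: while FIPS mode is active only TLS may be negotiated, and anything that would fall back to SSL 2.0/3.0 fails with SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE. Applications need no new code for this; the usual version-flexible setup keeps working and simply loses the legacy fallback, roughly:

    #include <openssl/ssl.h>

    /* Illustrative client setup, not part of the patch: under FIPS mode the
     * patched library refuses SSLv2/SSLv3, so this context will only ever
     * negotiate TLS 1.0 or later. */
    static SSL_CTX *make_tls_client_ctx(void)
    {
            SSL_library_init();
            SSL_load_error_strings();
            return SSL_CTX_new(SSLv23_client_method());
    }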
-diff -up openssl-1.0.0f/ssl/ssltest.c.fips openssl-1.0.0f/ssl/ssltest.c
---- openssl-1.0.0f/ssl/ssltest.c.fips	2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/ssl/ssltest.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/ssl/ssltest.c.fips openssl-1.0.0k/ssl/ssltest.c
+--- openssl-1.0.0k/ssl/ssltest.c.fips	2013-02-19 20:12:54.542663869 +0100
++++ openssl-1.0.0k/ssl/ssltest.c	2013-02-19 20:12:54.606665085 +0100
 @@ -268,6 +268,9 @@ static void sv_usage(void)
  	{
  	fprintf(stderr,"usage: ssltest [args ...]\n");
@@ -12035,124 +12147,9 @@ diff -up openssl-1.0.0f/ssl/ssltest.c.fips openssl-1.0.0f/ssl/ssltest.c
  	if(s->version == TLS1_VERSION)
  		FIPS_allow_md5(0);
  # endif
-diff -up openssl-1.0.0f/ssl/s23_clnt.c.fips openssl-1.0.0f/ssl/s23_clnt.c
---- openssl-1.0.0f/ssl/s23_clnt.c.fips	2010-02-16 15:20:40.000000000 +0100
-+++ openssl-1.0.0f/ssl/s23_clnt.c	2012-01-05 13:22:30.000000000 +0100
-@@ -334,6 +334,14 @@ static int ssl23_client_hello(SSL *s)
- 			version_major = TLS1_VERSION_MAJOR;
- 			version_minor = TLS1_VERSION_MINOR;
- 			}
-+#ifdef OPENSSL_FIPS
-+		else if(FIPS_mode())
-+			{
-+			SSLerr(SSL_F_SSL23_CLIENT_HELLO,
-+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
-+			return -1;
-+			}
-+#endif
- 		else if (version == SSL3_VERSION)
- 			{
- 			version_major = SSL3_VERSION_MAJOR;
-@@ -617,6 +625,14 @@ static int ssl23_get_server_hello(SSL *s
- 		if ((p[2] == SSL3_VERSION_MINOR) &&
- 			!(s->options & SSL_OP_NO_SSLv3))
- 			{
-+#ifdef OPENSSL_FIPS
-+			if(FIPS_mode())
-+				{
-+				SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,
-+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
-+				goto err;
-+				}
-+#endif
- 			s->version=SSL3_VERSION;
- 			s->method=SSLv3_client_method();
- 			}
-diff -up openssl-1.0.0f/ssl/s23_srvr.c.fips openssl-1.0.0f/ssl/s23_srvr.c
---- openssl-1.0.0f/ssl/s23_srvr.c.fips	2010-02-16 15:20:40.000000000 +0100
-+++ openssl-1.0.0f/ssl/s23_srvr.c	2012-01-05 13:22:30.000000000 +0100
-@@ -393,6 +393,15 @@ int ssl23_get_client_hello(SSL *s)
- 			}
- 		}
- 
-+#ifdef OPENSSL_FIPS
-+	if (FIPS_mode() && (s->version < TLS1_VERSION))
-+		{
-+		SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO,
-+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
-+		goto err;
-+		}
-+#endif
-+
- 	if (s->state == SSL23_ST_SR_CLNT_HELLO_B)
- 		{
- 		/* we have SSLv3/TLSv1 in an SSLv2 header
-diff -up openssl-1.0.0f/ssl/s3_clnt.c.fips openssl-1.0.0f/ssl/s3_clnt.c
---- openssl-1.0.0f/ssl/s3_clnt.c.fips	2011-12-26 20:38:19.000000000 +0100
-+++ openssl-1.0.0f/ssl/s3_clnt.c	2012-01-05 13:22:30.000000000 +0100
-@@ -156,6 +156,10 @@
- #include <openssl/objects.h>
- #include <openssl/evp.h>
- #include <openssl/md5.h>
-+#ifdef OPENSSL_FIPS
-+#include <openssl/fips.h>
-+#endif
-+
- #ifndef OPENSSL_NO_DH
- #include <openssl/dh.h>
- #endif
-@@ -1550,6 +1554,8 @@ int ssl3_get_key_exchange(SSL *s)
- 			q=md_buf;
- 			for (num=2; num > 0; num--)
- 				{
-+				EVP_MD_CTX_set_flags(&md_ctx,
-+					EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- 				EVP_DigestInit_ex(&md_ctx,(num == 2)
- 					?s->ctx->md5:s->ctx->sha1, NULL);
- 				EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-diff -up openssl-1.0.0f/ssl/s3_enc.c.fips openssl-1.0.0f/ssl/s3_enc.c
---- openssl-1.0.0f/ssl/s3_enc.c.fips	2012-01-04 16:38:54.000000000 +0100
-+++ openssl-1.0.0f/ssl/s3_enc.c	2012-01-05 13:22:30.000000000 +0100
-@@ -170,6 +170,7 @@ static int ssl3_generate_key_block(SSL *
- #endif
- 	k=0;
- 	EVP_MD_CTX_init(&m5);
-+	EVP_MD_CTX_set_flags(&m5, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- 	EVP_MD_CTX_init(&s1);
- 	for (i=0; (int)i<num; i+=MD5_DIGEST_LENGTH)
- 		{
-@@ -616,6 +617,8 @@ int ssl3_digest_cached_records(SSL *s)
- 		if ((mask & s->s3->tmp.new_cipher->algorithm2) && md) 
- 			{
- 			s->s3->handshake_dgst[i]=EVP_MD_CTX_create();
-+			EVP_MD_CTX_set_flags(s->s3->handshake_dgst[i],
-+				EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- 			EVP_DigestInit_ex(s->s3->handshake_dgst[i],md,NULL);
- 			EVP_DigestUpdate(s->s3->handshake_dgst[i],hdata,hdatalen);
- 			} 
-@@ -672,6 +675,7 @@ static int ssl3_handshake_mac(SSL *s, in
- 		return 0;
- 	}	
- 	EVP_MD_CTX_init(&ctx);
-+	EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- 	EVP_MD_CTX_copy_ex(&ctx,d);
- 	n=EVP_MD_CTX_size(&ctx);
- 	if (n < 0)
-diff -up openssl-1.0.0f/ssl/s3_srvr.c.fips openssl-1.0.0f/ssl/s3_srvr.c
---- openssl-1.0.0f/ssl/s3_srvr.c.fips	2012-01-04 16:27:54.000000000 +0100
-+++ openssl-1.0.0f/ssl/s3_srvr.c	2012-01-05 13:22:30.000000000 +0100
-@@ -1770,6 +1770,8 @@ int ssl3_send_server_key_exchange(SSL *s
- 				j=0;
- 				for (num=2; num > 0; num--)
- 					{
-+					EVP_MD_CTX_set_flags(&md_ctx,
-+						EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- 					EVP_DigestInit_ex(&md_ctx,(num == 2)
- 						?s->ctx->md5:s->ctx->sha1, NULL);
- 					EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-diff -up openssl-1.0.0f/ssl/t1_enc.c.fips openssl-1.0.0f/ssl/t1_enc.c
---- openssl-1.0.0f/ssl/t1_enc.c.fips	2010-06-15 19:25:15.000000000 +0200
-+++ openssl-1.0.0f/ssl/t1_enc.c	2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/ssl/t1_enc.c.fips openssl-1.0.0k/ssl/t1_enc.c
+--- openssl-1.0.0k/ssl/t1_enc.c.fips	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/t1_enc.c	2013-02-19 20:12:54.610665161 +0100
 @@ -170,6 +170,8 @@ static int tls1_P_hash(const EVP_MD *md,
  
  	HMAC_CTX_init(&ctx);
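
tls1_P_hash() is one half of the TLS 1.0/1.1 pseudo-random function, PRF(secret, label, seed) = P_MD5(S1, label + seed) XOR P_SHA1(S2, label + seed), so the HMAC context initialised here has to be allowed to run HMAC-MD5 even under FIPS restrictions. The two added lines are not shown in this hunk, but they presumably apply the same NON_FIPS_ALLOW exemption to this HMAC_CTX (via HMAC_CTX_set_flags()) that the ssl3 code above applies to its EVP_MD_CTX objects.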
diff --git a/openssl-1.0.0d-intelopts.patch b/openssl-1.0.0k-intelopts.patch
similarity index 97%
rename from openssl-1.0.0d-intelopts.patch
rename to openssl-1.0.0k-intelopts.patch
index 6aba7b3..0cf1852 100644
--- a/openssl-1.0.0d-intelopts.patch
+++ b/openssl-1.0.0k-intelopts.patch
@@ -1,233 +1,254 @@
-diff -up openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl.intelopts openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl
---- openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl.intelopts	2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl	2011-08-24 12:50:56.000000000 +0200
+diff -up openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl.intelopts openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl
+--- openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl.intelopts	2013-02-19 21:15:39.391403202 +0100
++++ openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl	2013-02-19 21:15:39.427403937 +0100
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env perl
 +#!/usr/bin/perl
- 
+ #
  # ====================================================================
  # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -11,10 +11,37 @@
+@@ -11,6 +11,145 @@
  # OpenSSL context it's used with Intel engine, but can also be used as
- # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
+ # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
  # details].
 +#
 +# Performance.
 +#
-+# To start with see corresponding paragraph in aesni-x86_64.pl...
-+# Instead of filling table similar to one found there I've chosen to
-+# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
-+# The simplified table below represents 32-bit performance relative
-+# to 64-bit one in every given point. Ratios vary for different
-+# encryption modes, therefore interval values.
++# Given aes(enc|dec) instructions' latency asymptotic performance for
++# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
++# processed with 128-bit key. And given their throughput asymptotic
++# performance for parallelizable modes is 1.25 cycles per byte. Being
++# asymptotic limit it's not something you commonly achieve in reality,
++# but how close does one get? Below are results collected for
++# different modes and block sized. Pairs of numbers are for en-/
++# decryption.
 +#
 +#	16-byte     64-byte     256-byte    1-KB        8-KB
-+#	53-67%      67-84%      91-94%      95-98%      97-99.5%
++# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
++# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
++# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
++# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
++# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
++# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
 +#
-+# Lower ratios for smaller block sizes are perfectly understandable,
-+# because function call overhead is higher in 32-bit mode. Largest
-+# 8-KB block performance is virtually same: 32-bit code is less than
-+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
++# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
++# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
++# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
++# The results were collected with specially crafted speed.c benchmark
++# in order to compare them with results reported in "Intel Advanced
++# Encryption Standard (AES) New Instruction Set" White Paper Revision
++# 3.0 dated May 2010. All above results are consistently better. This
++# module also provides better performance for block sizes smaller than
++# 128 bytes in points *not* represented in the above table.
++#
++# Looking at the results for 8-KB buffer.
++#
++# CFB and OFB results are far from the limit, because implementation
++# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
++# single-block aesni_encrypt, which is not the most optimal way to go.
++# CBC encrypt result is unexpectedly high and there is no documented
++# explanation for it. Seemingly there is a small penalty for feeding
++# the result back to AES unit the way it's done in CBC mode. There is
++# nothing one can do and the result appears optimal. CCM result is
++# identical to CBC, because CBC-MAC is essentially CBC encrypt without
++# saving output. CCM CTR "stays invisible," because it's neatly
++# interleaved wih CBC-MAC. This provides ~30% improvement over
++# "straghtforward" CCM implementation with CTR and CBC-MAC performed
++# disjointly. Parallelizable modes practically achieve the theoretical
++# limit.
++#
++# Looking at how results vary with buffer size.
++#
++# Curves are practically saturated at 1-KB buffer size. In most cases
++# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
++# CTR curve doesn't follow this pattern and is "slowest" changing one
++# with "256-byte" result being 87% of "8-KB." This is because overhead
++# in CTR mode is most computationally intensive. Small-block CCM
++# decrypt is slower than encrypt, because first CTR and last CBC-MAC
++# iterations can't be interleaved.
++#
++# Results for 192- and 256-bit keys.
++#
++# EVP-free results were observed to scale perfectly with number of
++# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
++# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
++# are a tad smaller, because the above mentioned penalty biases all
++# results by same constant value. In similar way function call
++# overhead affects small-block performance, as well as OFB and CFB
++# results. Differences are not large, most common coefficients are
++# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
++# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
 +
 +# January 2011
 +#
-+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
-+# interleaves at most 6 aes[enc|dec] instructions, because there are
-+# not enough registers for 8x interleave [which should be optimal for
-+# Sandy Bridge]. Actually, performance results for 6x interleave
-+# factor presented in aesni-x86_64.pl (except for CTR) are for this
-+# module.
++# While Westmere processor features 6 cycles latency for aes[enc|dec]
++# instructions, which can be scheduled every second cycle, Sandy
++# Bridge spends 8 cycles per instruction, but it can schedule them
++# every cycle. This means that code targeting Westmere would perform
++# suboptimally on Sandy Bridge. Therefore this update.
++#
++# In addition, non-parallelizable CBC encrypt (as well as CCM) is
++# optimized. Relative improvement might appear modest, 8% on Westmere,
++# but in absolute terms it's 3.77 cycles per byte encrypted with
++# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
++# should be compared to asymptotic limits of 3.75 for Westmere and
++# 5.00 for Sandy Bridge. Actually, the fact that they get this close
++# to asymptotic limits is quite amazing. Indeed, the limit is
++# calculated as latency times number of rounds, 10 for 128-bit key,
++# and divided by 16, the number of bytes in block, or in other words
++# it accounts *solely* for aesenc instructions. But there are extra
++# instructions, and numbers so close to the asymptotic limits mean
++# that it's as if it takes as little as *one* additional cycle to
++# execute all of them. How is it possible? It is possible thanks to
++# out-of-order execution logic, which manages to overlap post-
++# processing of previous block, things like saving the output, with
++# actual encryption of current block, as well as pre-processing of
++# current block, things like fetching input and xor-ing it with
++# 0-round element of the key schedule, with actual encryption of
++# previous block. Keep this in mind...
++#
++# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
++# performance is achieved by interleaving instructions working on
++# independent blocks. In which case asymptotic limit for such modes
++# can be obtained by dividing above mentioned numbers by AES
++# instructions' interleave factor. Westmere can execute at most 3 
++# instructions at a time, meaning that optimal interleave factor is 3,
++# and that's where the "magic" number of 1.25 come from. "Optimal
++# interleave factor" means that increase of interleave factor does
++# not improve performance. The formula has proven to reflect reality
++# pretty well on Westmere... Sandy Bridge on the other hand can
++# execute up to 8 AES instructions at a time, so how does varying
++# interleave factor affect the performance? Here is table for ECB
++# (numbers are cycles per byte processed with 128-bit key):
++#
++# instruction interleave factor		3x	6x	8x
++# theoretical asymptotic limit		1.67	0.83	0.625
++# measured performance for 8KB block	1.05	0.86	0.84
++#
++# "as if" interleave factor		4.7x	5.8x	6.0x
++#
++# Further data for other parallelizable modes:
++#
++# CBC decrypt				1.16	0.93	0.93
++# CTR					1.14	0.91	n/a
++#
++# Well, given 3x column it's probably inappropriate to call the limit
++# asymptotic, if it can be surpassed, isn't it? What happens there?
++# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
++# magic is responsible for this. Processor overlaps not only the
++# additional instructions with AES ones, but even AES instuctions
++# processing adjacent triplets of independent blocks. In the 6x case
++# additional instructions  still claim disproportionally small amount
++# of additional cycles, but in 8x case number of instructions must be
++# a tad too high for out-of-order logic to cope with, and AES unit
++# remains underutilized... As you can see 8x interleave is hardly
++# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
++# utilizies 6x interleave because of limited register bank capacity.
++#
++# Higher interleave factors do have negative impact on Westmere
++# performance. While for ECB mode it's negligible ~1.5%, other
++# parallelizables perform ~5% worse, which is outweighed by ~25%
++# improvement on Sandy Bridge. To balance regression on Westmere
++# CTR mode was implemented with 6x aesenc interleave factor.
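
To make the arithmetic behind those figures concrete: the non-parallelizable limit quoted above is instruction latency x rounds / 16 bytes per block, i.e. 6 x 10 / 16 = 3.75 cycles per byte for a 128-bit key on Westmere and 8 x 10 / 16 = 5.00 on Sandy Bridge. The parallelizable limit divides that by the interleave factor, which is where the table's 5.00/3 = 1.67, 5.00/6 = 0.83 and 5.00/8 = 0.625 cycles per byte come from, and why Westmere's optimal 3x interleave yields the 1.25 cycles-per-byte figure used earlier.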
  
  $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
  			# generates drop-in replacement for
- 			# crypto/aes/asm/aes-586.pl:-)
-+$inline=1;		# inline _aesni_[en|de]crypt
- 
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- push(@INC,"${dir}","${dir}../../perlasm");
-@@ -22,7 +49,8 @@ require "x86asm.pl";
- 
- &asm_init($ARGV[0],$0);
+@@ -29,7 +168,7 @@ die "can't locate x86_64-xlate.pl";
  
--$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
-+if ($PREFIX eq "aesni")	{ $movekey=*movups; }
-+else			{ $movekey=*movups; }
+ open STDOUT,"| $^X $xlate $flavour $output";
  
- $len="eax";
- $rounds="ecx";
-@@ -32,114 +60,144 @@ $out="edi";
- $rounds_="ebx";	# backup copy for $rounds
- $key_="ebp";	# backup copy for $key
+-$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
++$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+ @_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+ 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
  
--$inout0="xmm0";
--$inout1="xmm1";
--$inout2="xmm2";
--$rndkey0="xmm3";
--$rndkey1="xmm4";
--$ivec="xmm5";
--$in0="xmm6";
--$in1="xmm7";	$inout3="xmm7";
--
-+$rndkey0="xmm0";
-+$rndkey1="xmm1";
-+$inout0="xmm2";
-+$inout1="xmm3";
-+$inout2="xmm4";
-+$inout3="xmm5";	$in1="xmm5";
-+$inout4="xmm6";	$in0="xmm6";
-+$inout5="xmm7";	$ivec="xmm7";
-+
-+# AESNI extenstion
-+sub aeskeygenassist
-+{ my($dst,$src,$imm)=@_;
-+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
-+}
-+sub aescommon
-+{ my($opcodelet,$dst,$src)=@_;
-+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
-+}
-+sub aesimc	{ aescommon(0xdb,@_); }
-+sub aesenc	{ aescommon(0xdc,@_); }
-+sub aesenclast	{ aescommon(0xdd,@_); }
-+sub aesdec	{ aescommon(0xde,@_); }
-+sub aesdeclast	{ aescommon(0xdf,@_); }
-+
- # Inline version of internal aesni_[en|de]crypt1
-+{ my $sn;
- sub aesni_inline_generate1
--{ my $p=shift;
-+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
-+  $sn++;
+@@ -41,18 +180,20 @@ $inp="%rdi";
+ $out="%rsi";
+ $len="%rdx";
+ $key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
+-$ivp="%r8";	# cbc
++$ivp="%r8";	# cbc, ctr, ...
  
-     &$movekey		($rndkey0,&QWP(0,$key));
-     &$movekey		($rndkey1,&QWP(16,$key));
-+    &xorps		($ivec,$rndkey0)	if (defined($ivec));
-     &lea		($key,&DWP(32,$key));
--    &pxor		($inout0,$rndkey0);
--    &set_label("${p}1_loop");
--	eval"&aes${p}	($inout0,$rndkey1)";
-+    &xorps		($inout,$ivec)		if (defined($ivec));
-+    &xorps		($inout,$rndkey0)	if (!defined($ivec));
-+    &set_label("${p}1_loop_$sn");
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&dec		($rounds);
- 	&$movekey	($rndkey1,&QWP(0,$key));
- 	&lea		($key,&DWP(16,$key));
--    &jnz		(&label("${p}1_loop"));
--    eval"&aes${p}last	($inout0,$rndkey1)";
--}
-+    &jnz		(&label("${p}1_loop_$sn"));
-+    eval"&aes${p}last	($inout,$rndkey1)";
-+}}
+ $rnds_="%r10d";	# backup copy for $rounds
+ $key_="%r11";	# backup copy for $key
  
- sub aesni_generate1	# fully unrolled loop
--{ my $p=shift;
-+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+ # %xmm register layout
+-$inout0="%xmm0";	$inout1="%xmm1";
+-$inout2="%xmm2";	$inout3="%xmm3";
+-$rndkey0="%xmm4";	$rndkey1="%xmm5";
++$rndkey0="%xmm0";	$rndkey1="%xmm1";
++$inout0="%xmm2";	$inout1="%xmm3";
++$inout2="%xmm4";	$inout3="%xmm5";
++$inout4="%xmm6";	$inout5="%xmm7";
++$inout6="%xmm8";	$inout7="%xmm9";
  
-     &function_begin_B("_aesni_${p}rypt1");
--	&$movekey	($rndkey0,&QWP(0,$key));
-+	&movups		($rndkey0,&QWP(0,$key));
- 	&$movekey	($rndkey1,&QWP(0x10,$key));
--	&cmp		($rounds,11);
--	&pxor		($inout0,$rndkey0);
-+	&xorps		($inout,$rndkey0);
- 	&$movekey	($rndkey0,&QWP(0x20,$key));
- 	&lea		($key,&DWP(0x30,$key));
-+	&cmp		($rounds,11);
- 	&jb		(&label("${p}128"));
- 	&lea		($key,&DWP(0x20,$key));
- 	&je		(&label("${p}192"));
- 	&lea		($key,&DWP(0x20,$key));
--	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(-0x40,$key));
--	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey0)";
- 	&$movekey	($rndkey0,&QWP(-0x30,$key));
-     &set_label("${p}192");
--	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(-0x20,$key));
--	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey0)";
- 	&$movekey	($rndkey0,&QWP(-0x10,$key));
-     &set_label("${p}128");
--	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(0,$key));
--	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey0)";
- 	&$movekey	($rndkey0,&QWP(0x10,$key));
--	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(0x20,$key));
--	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey0)";
- 	&$movekey	($rndkey0,&QWP(0x30,$key));
--	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(0x40,$key));
--	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey0)";
- 	&$movekey	($rndkey0,&QWP(0x50,$key));
--	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(0x60,$key));
--	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey0)";
- 	&$movekey	($rndkey0,&QWP(0x70,$key));
--	eval"&aes${p}	($inout0,$rndkey1)";
--    eval"&aes${p}last	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout,$rndkey1)";
-+    eval"&aes${p}last	($inout,$rndkey0)";
-     &ret();
-     &function_end_B("_aesni_${p}rypt1");
+-$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
+-$in1="%xmm8";		$in2="%xmm9";
++$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
++$in0="%xmm8";		$iv="%xmm9";
+ 
+ # Inline version of internal aesni_[en|de]crypt1.
+ #
+@@ -60,20 +201,29 @@ $in1="%xmm8";		$in2="%xmm9";
+ # cycles which take care of loop variables...
+ { my $sn;
+ sub aesni_generate1 {
+-my ($p,$key,$rounds)=@_;
++my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
+ ++$sn;
+ $code.=<<___;
+ 	$movkey	($key),$rndkey0
+ 	$movkey	16($key),$rndkey1
++___
++$code.=<<___ if (defined($ivec));
++	xorps	$rndkey0,$ivec
+ 	lea	32($key),$key
+-	pxor	$rndkey0,$inout0
++	xorps	$ivec,$inout
++___
++$code.=<<___ if (!defined($ivec));
++	lea	32($key),$key
++	xorps	$rndkey0,$inout
++___
++$code.=<<___;
+ .Loop_${p}1_$sn:
+-	aes${p}	$rndkey1,$inout0
++	aes${p}	$rndkey1,$inout
+ 	dec	$rounds
+ 	$movkey	($key),$rndkey1
+ 	lea	16($key),$key
+ 	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
+-	aes${p}last	$rndkey1,$inout0
++	aes${p}last	$rndkey1,$inout
+ ___
+ }}
+ # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+@@ -86,7 +236,7 @@ $code.=<<___;
+ .align	16
+ ${PREFIX}_encrypt:
+ 	movups	($inp),$inout0		# load input
+-	mov	240($key),$rounds	# pull $rounds
++	mov	240($key),$rounds	# key->rounds
+ ___
+ 	&aesni_generate1("enc",$key,$rounds);
+ $code.=<<___;
+@@ -99,7 +249,7 @@ $code.=<<___;
+ .align	16
+ ${PREFIX}_decrypt:
+ 	movups	($inp),$inout0		# load input
+-	mov	240($key),$rounds	# pull $rounds
++	mov	240($key),$rounds	# key->rounds
+ ___
+ 	&aesni_generate1("dec",$key,$rounds);
+ $code.=<<___;
+@@ -109,16 +259,16 @@ $code.=<<___;
+ ___
  }
--
-+
- # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
--# &aesni_generate1("dec");
-+&aesni_generate1("enc") if (!$inline);
- &function_begin_B("${PREFIX}_encrypt");
- 	&mov	("eax",&wparam(0));
- 	&mov	($key,&wparam(2));
- 	&movups	($inout0,&QWP(0,"eax"));
- 	&mov	($rounds,&DWP(240,$key));
- 	&mov	("eax",&wparam(1));
--	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
-+	if ($inline)
-+	{   &aesni_inline_generate1("enc");	}
-+	else
-+	{   &call	("_aesni_encrypt1");	}
- 	&movups	(&QWP(0,"eax"),$inout0);
- 	&ret	();
- &function_end_B("${PREFIX}_encrypt");
- 
- # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
--# &aesni_generate1("dec");
-+&aesni_generate1("dec") if(!$inline);
- &function_begin_B("${PREFIX}_decrypt");
- 	&mov	("eax",&wparam(0));
- 	&mov	($key,&wparam(2));
- 	&movups	($inout0,&QWP(0,"eax"));
- 	&mov	($rounds,&DWP(240,$key));
- 	&mov	("eax",&wparam(1));
--	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt1");
-+	if ($inline)
-+	{   &aesni_inline_generate1("dec");	}
-+	else
-+	{   &call	("_aesni_decrypt1");	}
- 	&movups	(&QWP(0,"eax"),$inout0);
- 	&ret	();
- &function_end_B("${PREFIX}_decrypt");
--
+ 
 -# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
 -# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
 -# latency is 6, it turned out that it can be scheduled only every
 -# *second* cycle. Thus 3x interleave is the one providing optimal
-+
 +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
 +# factor. Why 3x subroutine were originally used in loops? Even though
 +# aes[enc|dec] latency was originally 6, it could be scheduled only
@@ -241,2081 +262,2060 @@ diff -up openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl.intelopts openssl-1.0.0d/cry
 +# This is why it makes no sense to implement 2x subroutine.
 +# aes[enc|dec] latency in next processor generation is 8, but the
 +# instructions can be scheduled every cycle. Optimal interleave for
-+# new processor is therefore 8x, but it's unfeasible to accommodate it
-+# in XMM registers addreassable in 32-bit mode and therefore 6x is
-+# used instead...
-+
- sub aesni_generate3
- { my $p=shift;
++# new processor is therefore 8x...
+ sub aesni_generate3 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+@@ -131,25 +281,25 @@ _aesni_${dir}rypt3:
+ 	shr	\$1,$rounds
+ 	$movkey	16($key),$rndkey1
+ 	lea	32($key),$key
+-	pxor	$rndkey0,$inout0
+-	pxor	$rndkey0,$inout1
+-	pxor	$rndkey0,$inout2
++	xorps	$rndkey0,$inout0
++	xorps	$rndkey0,$inout1
++	xorps	$rndkey0,$inout2
++	$movkey		($key),$rndkey0
  
-@@ -148,24 +206,24 @@ sub aesni_generate3
- 	&shr		($rounds,1);
- 	&$movekey	($rndkey1,&QWP(16,$key));
- 	&lea		($key,&DWP(32,$key));
--	&pxor		($inout0,$rndkey0);
-+	&xorps		($inout0,$rndkey0);
- 	&pxor		($inout1,$rndkey0);
- 	&pxor		($inout2,$rndkey0);
--	&jmp		(&label("${p}3_loop"));
--    &set_label("${p}3_loop",16);
--	eval"&aes${p}	($inout0,$rndkey1)";
- 	&$movekey	($rndkey0,&QWP(0,$key));
-+
-+    &set_label("${p}3_loop");
-+	eval"&aes${p}	($inout0,$rndkey1)";
- 	eval"&aes${p}	($inout1,$rndkey1)";
- 	&dec		($rounds);
- 	eval"&aes${p}	($inout2,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(16,$key));
- 	eval"&aes${p}	($inout0,$rndkey0)";
--	&lea		($key,&DWP(32,$key));
- 	eval"&aes${p}	($inout1,$rndkey0)";
-+	&lea		($key,&DWP(32,$key));
- 	eval"&aes${p}	($inout2,$rndkey0)";
-+	&$movekey	($rndkey0,&QWP(0,$key));
- 	&jnz		(&label("${p}3_loop"));
-     eval"&aes${p}	($inout0,$rndkey1)";
--    &$movekey		($rndkey0,&QWP(0,$key));
-     eval"&aes${p}	($inout1,$rndkey1)";
-     eval"&aes${p}	($inout2,$rndkey1)";
-     eval"&aes${p}last	($inout0,$rndkey0)";
-@@ -187,27 +245,28 @@ sub aesni_generate4
- 	&$movekey	($rndkey1,&QWP(16,$key));
- 	&shr		($rounds,1);
- 	&lea		($key,&DWP(32,$key));
--	&pxor		($inout0,$rndkey0);
-+	&xorps		($inout0,$rndkey0);
- 	&pxor		($inout1,$rndkey0);
- 	&pxor		($inout2,$rndkey0);
- 	&pxor		($inout3,$rndkey0);
--	&jmp		(&label("${p}3_loop"));
--    &set_label("${p}3_loop",16);
--	eval"&aes${p}	($inout0,$rndkey1)";
- 	&$movekey	($rndkey0,&QWP(0,$key));
-+
-+    &set_label("${p}4_loop");
-+	eval"&aes${p}	($inout0,$rndkey1)";
- 	eval"&aes${p}	($inout1,$rndkey1)";
- 	&dec		($rounds);
- 	eval"&aes${p}	($inout2,$rndkey1)";
- 	eval"&aes${p}	($inout3,$rndkey1)";
- 	&$movekey	($rndkey1,&QWP(16,$key));
- 	eval"&aes${p}	($inout0,$rndkey0)";
--	&lea		($key,&DWP(32,$key));
- 	eval"&aes${p}	($inout1,$rndkey0)";
-+	&lea		($key,&DWP(32,$key));
- 	eval"&aes${p}	($inout2,$rndkey0)";
- 	eval"&aes${p}	($inout3,$rndkey0)";
--	&jnz		(&label("${p}3_loop"));
-+	&$movekey	($rndkey0,&QWP(0,$key));
-+    &jnz		(&label("${p}4_loop"));
-+
-     eval"&aes${p}	($inout0,$rndkey1)";
--    &$movekey		($rndkey0,&QWP(0,$key));
-     eval"&aes${p}	($inout1,$rndkey1)";
-     eval"&aes${p}	($inout2,$rndkey1)";
-     eval"&aes${p}	($inout3,$rndkey1)";
-@@ -218,12 +277,76 @@ sub aesni_generate4
-     &ret();
-     &function_end_B("_aesni_${p}rypt4");
- }
-+
-+sub aesni_generate6
-+{ my $p=shift;
-+
-+    &function_begin_B("_aesni_${p}rypt6");
-+    &static_label("_aesni_${p}rypt6_enter");
-+	&$movekey	($rndkey0,&QWP(0,$key));
-+	&shr		($rounds,1);
-+	&$movekey	($rndkey1,&QWP(16,$key));
-+	&lea		($key,&DWP(32,$key));
-+	&xorps		($inout0,$rndkey0);
-+	&pxor		($inout1,$rndkey0);	# pxor does better here
-+	eval"&aes${p}	($inout0,$rndkey1)";
-+	&pxor		($inout2,$rndkey0);
-+	eval"&aes${p}	($inout1,$rndkey1)";
-+	&pxor		($inout3,$rndkey0);
-+	&dec		($rounds);
-+	eval"&aes${p}	($inout2,$rndkey1)";
-+	&pxor		($inout4,$rndkey0);
-+	eval"&aes${p}	($inout3,$rndkey1)";
-+	&pxor		($inout5,$rndkey0);
-+	eval"&aes${p}	($inout4,$rndkey1)";
-+	&$movekey	($rndkey0,&QWP(0,$key));
-+	eval"&aes${p}	($inout5,$rndkey1)";
-+	&jmp		(&label("_aesni_${p}rypt6_enter"));
+ .L${dir}_loop3:
+ 	aes${dir}	$rndkey1,$inout0
+-	$movkey		($key),$rndkey0
+ 	aes${dir}	$rndkey1,$inout1
+ 	dec		$rounds
+ 	aes${dir}	$rndkey1,$inout2
+-	aes${dir}	$rndkey0,$inout0
+ 	$movkey		16($key),$rndkey1
++	aes${dir}	$rndkey0,$inout0
+ 	aes${dir}	$rndkey0,$inout1
+ 	lea		32($key),$key
+ 	aes${dir}	$rndkey0,$inout2
++	$movkey		($key),$rndkey0
+ 	jnz		.L${dir}_loop3
+ 
+ 	aes${dir}	$rndkey1,$inout0
+-	$movkey		($key),$rndkey0
+ 	aes${dir}	$rndkey1,$inout1
+ 	aes${dir}	$rndkey1,$inout2
+ 	aes${dir}last	$rndkey0,$inout0
+@@ -175,28 +325,28 @@ _aesni_${dir}rypt4:
+ 	shr	\$1,$rounds
+ 	$movkey	16($key),$rndkey1
+ 	lea	32($key),$key
+-	pxor	$rndkey0,$inout0
+-	pxor	$rndkey0,$inout1
+-	pxor	$rndkey0,$inout2
+-	pxor	$rndkey0,$inout3
++	xorps	$rndkey0,$inout0
++	xorps	$rndkey0,$inout1
++	xorps	$rndkey0,$inout2
++	xorps	$rndkey0,$inout3
++	$movkey	($key),$rndkey0
+ 
+ .L${dir}_loop4:
+ 	aes${dir}	$rndkey1,$inout0
+-	$movkey		($key),$rndkey0
+ 	aes${dir}	$rndkey1,$inout1
+ 	dec		$rounds
+ 	aes${dir}	$rndkey1,$inout2
+ 	aes${dir}	$rndkey1,$inout3
+-	aes${dir}	$rndkey0,$inout0
+ 	$movkey		16($key),$rndkey1
++	aes${dir}	$rndkey0,$inout0
+ 	aes${dir}	$rndkey0,$inout1
+ 	lea		32($key),$key
+ 	aes${dir}	$rndkey0,$inout2
+ 	aes${dir}	$rndkey0,$inout3
++	$movkey		($key),$rndkey0
+ 	jnz		.L${dir}_loop4
+ 
+ 	aes${dir}	$rndkey1,$inout0
+-	$movkey		($key),$rndkey0
+ 	aes${dir}	$rndkey1,$inout1
+ 	aes${dir}	$rndkey1,$inout2
+ 	aes${dir}	$rndkey1,$inout3
+@@ -208,12 +358,158 @@ _aesni_${dir}rypt4:
+ .size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+ ___
+ }
++sub aesni_generate6 {
++my $dir=shift;
++# As already mentioned it takes in $key and $rounds, which are *not*
++# preserved. $inout[0-5] is cipher/clear text...
++$code.=<<___;
++.type	_aesni_${dir}rypt6,\@abi-omnipotent
++.align	16
++_aesni_${dir}rypt6:
++	$movkey		($key),$rndkey0
++	shr		\$1,$rounds
++	$movkey		16($key),$rndkey1
++	lea		32($key),$key
++	xorps		$rndkey0,$inout0
++	pxor		$rndkey0,$inout1
++	aes${dir}	$rndkey1,$inout0
++	pxor		$rndkey0,$inout2
++	aes${dir}	$rndkey1,$inout1
++	pxor		$rndkey0,$inout3
++	aes${dir}	$rndkey1,$inout2
++	pxor		$rndkey0,$inout4
++	aes${dir}	$rndkey1,$inout3
++	pxor		$rndkey0,$inout5
++	dec		$rounds
++	aes${dir}	$rndkey1,$inout4
++	$movkey		($key),$rndkey0
++	aes${dir}	$rndkey1,$inout5
++	jmp		.L${dir}_loop6_enter
++.align	16
++.L${dir}_loop6:
++	aes${dir}	$rndkey1,$inout0
++	aes${dir}	$rndkey1,$inout1
++	dec		$rounds
++	aes${dir}	$rndkey1,$inout2
++	aes${dir}	$rndkey1,$inout3
++	aes${dir}	$rndkey1,$inout4
++	aes${dir}	$rndkey1,$inout5
++.L${dir}_loop6_enter:				# happens to be 16-byte aligned
++	$movkey		16($key),$rndkey1
++	aes${dir}	$rndkey0,$inout0
++	aes${dir}	$rndkey0,$inout1
++	lea		32($key),$key
++	aes${dir}	$rndkey0,$inout2
++	aes${dir}	$rndkey0,$inout3
++	aes${dir}	$rndkey0,$inout4
++	aes${dir}	$rndkey0,$inout5
++	$movkey		($key),$rndkey0
++	jnz		.L${dir}_loop6
 +
-+    &set_label("${p}6_loop",16);
-+	eval"&aes${p}	($inout0,$rndkey1)";
-+	eval"&aes${p}	($inout1,$rndkey1)";
-+	&dec		($rounds);
-+	eval"&aes${p}	($inout2,$rndkey1)";
-+	eval"&aes${p}	($inout3,$rndkey1)";
-+	eval"&aes${p}	($inout4,$rndkey1)";
-+	eval"&aes${p}	($inout5,$rndkey1)";
-+    &set_label("_aesni_${p}rypt6_enter",16);
-+	&$movekey	($rndkey1,&QWP(16,$key));
-+	eval"&aes${p}	($inout0,$rndkey0)";
-+	eval"&aes${p}	($inout1,$rndkey0)";
-+	&lea		($key,&DWP(32,$key));
-+	eval"&aes${p}	($inout2,$rndkey0)";
-+	eval"&aes${p}	($inout3,$rndkey0)";
-+	eval"&aes${p}	($inout4,$rndkey0)";
-+	eval"&aes${p}	($inout5,$rndkey0)";
-+	&$movekey	($rndkey0,&QWP(0,$key));
-+    &jnz		(&label("${p}6_loop"));
++	aes${dir}	$rndkey1,$inout0
++	aes${dir}	$rndkey1,$inout1
++	aes${dir}	$rndkey1,$inout2
++	aes${dir}	$rndkey1,$inout3
++	aes${dir}	$rndkey1,$inout4
++	aes${dir}	$rndkey1,$inout5
++	aes${dir}last	$rndkey0,$inout0
++	aes${dir}last	$rndkey0,$inout1
++	aes${dir}last	$rndkey0,$inout2
++	aes${dir}last	$rndkey0,$inout3
++	aes${dir}last	$rndkey0,$inout4
++	aes${dir}last	$rndkey0,$inout5
++	ret
++.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
++___
++}
++sub aesni_generate8 {
++my $dir=shift;
++# As already mentioned it takes in $key and $rounds, which are *not*
++# preserved. $inout[0-7] is cipher/clear text...
++$code.=<<___;
++.type	_aesni_${dir}rypt8,\@abi-omnipotent
++.align	16
++_aesni_${dir}rypt8:
++	$movkey		($key),$rndkey0
++	shr		\$1,$rounds
++	$movkey		16($key),$rndkey1
++	lea		32($key),$key
++	xorps		$rndkey0,$inout0
++	xorps		$rndkey0,$inout1
++	aes${dir}	$rndkey1,$inout0
++	pxor		$rndkey0,$inout2
++	aes${dir}	$rndkey1,$inout1
++	pxor		$rndkey0,$inout3
++	aes${dir}	$rndkey1,$inout2
++	pxor		$rndkey0,$inout4
++	aes${dir}	$rndkey1,$inout3
++	pxor		$rndkey0,$inout5
++	dec		$rounds
++	aes${dir}	$rndkey1,$inout4
++	pxor		$rndkey0,$inout6
++	aes${dir}	$rndkey1,$inout5
++	pxor		$rndkey0,$inout7
++	$movkey		($key),$rndkey0
++	aes${dir}	$rndkey1,$inout6
++	aes${dir}	$rndkey1,$inout7
++	$movkey		16($key),$rndkey1
++	jmp		.L${dir}_loop8_enter
++.align	16
++.L${dir}_loop8:
++	aes${dir}	$rndkey1,$inout0
++	aes${dir}	$rndkey1,$inout1
++	dec		$rounds
++	aes${dir}	$rndkey1,$inout2
++	aes${dir}	$rndkey1,$inout3
++	aes${dir}	$rndkey1,$inout4
++	aes${dir}	$rndkey1,$inout5
++	aes${dir}	$rndkey1,$inout6
++	aes${dir}	$rndkey1,$inout7
++	$movkey		16($key),$rndkey1
++.L${dir}_loop8_enter:				# happens to be 16-byte aligned
++	aes${dir}	$rndkey0,$inout0
++	aes${dir}	$rndkey0,$inout1
++	lea		32($key),$key
++	aes${dir}	$rndkey0,$inout2
++	aes${dir}	$rndkey0,$inout3
++	aes${dir}	$rndkey0,$inout4
++	aes${dir}	$rndkey0,$inout5
++	aes${dir}	$rndkey0,$inout6
++	aes${dir}	$rndkey0,$inout7
++	$movkey		($key),$rndkey0
++	jnz		.L${dir}_loop8
 +
-+    eval"&aes${p}	($inout0,$rndkey1)";
-+    eval"&aes${p}	($inout1,$rndkey1)";
-+    eval"&aes${p}	($inout2,$rndkey1)";
-+    eval"&aes${p}	($inout3,$rndkey1)";
-+    eval"&aes${p}	($inout4,$rndkey1)";
-+    eval"&aes${p}	($inout5,$rndkey1)";
-+    eval"&aes${p}last	($inout0,$rndkey0)";
-+    eval"&aes${p}last	($inout1,$rndkey0)";
-+    eval"&aes${p}last	($inout2,$rndkey0)";
-+    eval"&aes${p}last	($inout3,$rndkey0)";
-+    eval"&aes${p}last	($inout4,$rndkey0)";
-+    eval"&aes${p}last	($inout5,$rndkey0)";
-+    &ret();
-+    &function_end_B("_aesni_${p}rypt6");
++	aes${dir}	$rndkey1,$inout0
++	aes${dir}	$rndkey1,$inout1
++	aes${dir}	$rndkey1,$inout2
++	aes${dir}	$rndkey1,$inout3
++	aes${dir}	$rndkey1,$inout4
++	aes${dir}	$rndkey1,$inout5
++	aes${dir}	$rndkey1,$inout6
++	aes${dir}	$rndkey1,$inout7
++	aes${dir}last	$rndkey0,$inout0
++	aes${dir}last	$rndkey0,$inout1
++	aes${dir}last	$rndkey0,$inout2
++	aes${dir}last	$rndkey0,$inout3
++	aes${dir}last	$rndkey0,$inout4
++	aes${dir}last	$rndkey0,$inout5
++	aes${dir}last	$rndkey0,$inout6
++	aes${dir}last	$rndkey0,$inout7
++	ret
++.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
++___
 +}
  &aesni_generate3("enc") if ($PREFIX eq "aesni");
  &aesni_generate3("dec");
  &aesni_generate4("enc") if ($PREFIX eq "aesni");
  &aesni_generate4("dec");
--
 +&aesni_generate6("enc") if ($PREFIX eq "aesni");
 +&aesni_generate6("dec");
-+
++&aesni_generate8("enc") if ($PREFIX eq "aesni");
++&aesni_generate8("dec");
+ 
  if ($PREFIX eq "aesni") {
-+######################################################################
++########################################################################
  # void aesni_ecb_encrypt (const void *in, void *out,
- #                         size_t length, const AES_KEY *key,
- #                         int enc);
-@@ -232,62 +355,93 @@ if ($PREFIX eq "aesni") {
- 	&mov	($out,&wparam(1));
- 	&mov	($len,&wparam(2));
- 	&mov	($key,&wparam(3));
--	&mov	($rounds,&wparam(4));
--	&cmp	($len,16);
--	&jb	(&label("ecb_ret"));
-+	&mov	($rounds_,&wparam(4));
- 	&and	($len,-16);
--	&test	($rounds,$rounds)
-+	&jz	(&label("ecb_ret"));
- 	&mov	($rounds,&DWP(240,$key));
-+	&test	($rounds_,$rounds_);
-+	&jz	(&label("ecb_decrypt"));
-+
- 	&mov	($key_,$key);		# backup $key
- 	&mov	($rounds_,$rounds);	# backup $rounds
--	&jz	(&label("ecb_decrypt"));
-+	&cmp	($len,0x60);
-+	&jb	(&label("ecb_enc_tail"));
- 
--	&sub	($len,0x40);
--	&jbe	(&label("ecb_enc_tail"));
--	&jmp	(&label("ecb_enc_loop3"));
-+	&movdqu	($inout0,&QWP(0,$inp));
-+	&movdqu	($inout1,&QWP(0x10,$inp));
-+	&movdqu	($inout2,&QWP(0x20,$inp));
-+	&movdqu	($inout3,&QWP(0x30,$inp));
-+	&movdqu	($inout4,&QWP(0x40,$inp));
-+	&movdqu	($inout5,&QWP(0x50,$inp));
-+	&lea	($inp,&DWP(0x60,$inp));
-+	&sub	($len,0x60);
-+	&jmp	(&label("ecb_enc_loop6_enter"));
+ #			  size_t length, const AES_KEY *key,
+ #			  int enc);
+@@ -222,54 +518,98 @@ $code.=<<___;
+ .type	aesni_ecb_encrypt,\@function,5
+ .align	16
+ aesni_ecb_encrypt:
+-	cmp	\$16,$len		# check length
+-	jb	.Lecb_ret
+-
+-	mov	240($key),$rounds	# pull $rounds
+ 	and	\$-16,$len
++	jz	.Lecb_ret
 +
-+&set_label("ecb_enc_loop6",16);
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movdqu	($inout0,&QWP(0,$inp));
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&movdqu	($inout1,&QWP(0x10,$inp));
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&movdqu	($inout2,&QWP(0x20,$inp));
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&movdqu	($inout3,&QWP(0x30,$inp));
-+	&movups	(&QWP(0x40,$out),$inout4);
-+	&movdqu	($inout4,&QWP(0x40,$inp));
-+	&movups	(&QWP(0x50,$out),$inout5);
-+	&lea	($out,&DWP(0x60,$out));
-+	&movdqu	($inout5,&QWP(0x50,$inp));
-+	&lea	($inp,&DWP(0x60,$inp));
-+&set_label("ecb_enc_loop6_enter");
++	mov	240($key),$rounds	# key->rounds
++	$movkey	($key),$rndkey0
+ 	mov	$key,$key_		# backup $key
+-	test	%r8d,%r8d		# 5th argument
+ 	mov	$rounds,$rnds_		# backup $rounds
++	test	%r8d,%r8d		# 5th argument
+ 	jz	.Lecb_decrypt
+ #--------------------------- ECB ENCRYPT ------------------------------#
+-	sub	\$0x40,$len
+-	jbe	.Lecb_enc_tail
+-	jmp	.Lecb_enc_loop3
++	cmp	\$0x80,$len
++	jb	.Lecb_enc_tail
 +
-+	&call	("_aesni_encrypt6");
- 
--&set_label("ecb_enc_loop3",16);
--	&movups	($inout0,&QWP(0,$inp));
--	&movups	($inout1,&QWP(0x10,$inp));
--	&movups	($inout2,&QWP(0x20,$inp));
--	&call	("_aesni_encrypt3");
--	&sub	($len,0x30);
--	&lea	($inp,&DWP(0x30,$inp));
--	&lea	($out,&DWP(0x30,$out));
--	&movups	(&QWP(-0x30,$out),$inout0);
- 	&mov	($key,$key_);		# restore $key
--	&movups	(&QWP(-0x20,$out),$inout1);
- 	&mov	($rounds,$rounds_);	# restore $rounds
--	&movups	(&QWP(-0x10,$out),$inout2);
--	&ja	(&label("ecb_enc_loop3"));
-+	&sub	($len,0x60);
-+	&jnc	(&label("ecb_enc_loop6"));
- 
--&set_label("ecb_enc_tail");
--	&add	($len,0x40);
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&movups	(&QWP(0x40,$out),$inout4);
-+	&movups	(&QWP(0x50,$out),$inout5);
-+	&lea	($out,&DWP(0x60,$out));
-+	&add	($len,0x60);
- 	&jz	(&label("ecb_ret"));
- 
--	&cmp	($len,0x10);
-+&set_label("ecb_enc_tail");
- 	&movups	($inout0,&QWP(0,$inp));
--	&je	(&label("ecb_enc_one"));
- 	&cmp	($len,0x20);
-+	&jb	(&label("ecb_enc_one"));
- 	&movups	($inout1,&QWP(0x10,$inp));
- 	&je	(&label("ecb_enc_two"));
--	&cmp	($len,0x30);
- 	&movups	($inout2,&QWP(0x20,$inp));
--	&je	(&label("ecb_enc_three"));
-+	&cmp	($len,0x40);
-+	&jb	(&label("ecb_enc_three"));
- 	&movups	($inout3,&QWP(0x30,$inp));
--	&call	("_aesni_encrypt4");
-+	&je	(&label("ecb_enc_four"));
-+	&movups	($inout4,&QWP(0x40,$inp));
-+	&xorps	($inout5,$inout5);
-+	&call	("_aesni_encrypt6");
- 	&movups	(&QWP(0,$out),$inout0);
- 	&movups	(&QWP(0x10,$out),$inout1);
- 	&movups	(&QWP(0x20,$out),$inout2);
- 	&movups	(&QWP(0x30,$out),$inout3);
-+	&movups	(&QWP(0x40,$out),$inout4);
- 	jmp	(&label("ecb_ret"));
- 
- &set_label("ecb_enc_one",16);
--	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
-+	if ($inline)
-+	{   &aesni_inline_generate1("enc");	}
-+	else
-+	{   &call	("_aesni_encrypt1");	}
- 	&movups	(&QWP(0,$out),$inout0);
- 	&jmp	(&label("ecb_ret"));
- 
- &set_label("ecb_enc_two",16);
-+	&xorps	($inout2,$inout2);
- 	&call	("_aesni_encrypt3");
- 	&movups	(&QWP(0,$out),$inout0);
- 	&movups	(&QWP(0x10,$out),$inout1);
-@@ -300,53 +454,95 @@ if ($PREFIX eq "aesni") {
- 	&movups	(&QWP(0x20,$out),$inout2);
- 	&jmp	(&label("ecb_ret"));
++	movdqu	($inp),$inout0
++	movdqu	0x10($inp),$inout1
++	movdqu	0x20($inp),$inout2
++	movdqu	0x30($inp),$inout3
++	movdqu	0x40($inp),$inout4
++	movdqu	0x50($inp),$inout5
++	movdqu	0x60($inp),$inout6
++	movdqu	0x70($inp),$inout7
++	lea	0x80($inp),$inp
++	sub	\$0x80,$len
++	jmp	.Lecb_enc_loop8_enter
+ .align 16
+-.Lecb_enc_loop3:
+-	movups	($inp),$inout0
+-	movups	0x10($inp),$inout1
+-	movups	0x20($inp),$inout2
+-	call	_aesni_encrypt3
+-	sub	\$0x30,$len
+-	lea	0x30($inp),$inp
+-	lea	0x30($out),$out
+-	movups	$inout0,-0x30($out)
+-	mov	$rnds_,$rounds		# restore $rounds
+-	movups	$inout1,-0x20($out)
++.Lecb_enc_loop8:
++	movups	$inout0,($out)
+ 	mov	$key_,$key		# restore $key
+-	movups	$inout2,-0x10($out)
+-	ja	.Lecb_enc_loop3
++	movdqu	($inp),$inout0
++	mov	$rnds_,$rounds		# restore $rounds
++	movups	$inout1,0x10($out)
++	movdqu	0x10($inp),$inout1
++	movups	$inout2,0x20($out)
++	movdqu	0x20($inp),$inout2
++	movups	$inout3,0x30($out)
++	movdqu	0x30($inp),$inout3
++	movups	$inout4,0x40($out)
++	movdqu	0x40($inp),$inout4
++	movups	$inout5,0x50($out)
++	movdqu	0x50($inp),$inout5
++	movups	$inout6,0x60($out)
++	movdqu	0x60($inp),$inout6
++	movups	$inout7,0x70($out)
++	lea	0x80($out),$out
++	movdqu	0x70($inp),$inout7
++	lea	0x80($inp),$inp
++.Lecb_enc_loop8_enter:
  
-+&set_label("ecb_enc_four",16);
-+	&call	("_aesni_encrypt4");
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&jmp	(&label("ecb_ret"));
-+######################################################################
- &set_label("ecb_decrypt",16);
--	&sub	($len,0x40);
--	&jbe	(&label("ecb_dec_tail"));
--	&jmp	(&label("ecb_dec_loop3"));
-+	&mov	($key_,$key);		# backup $key
-+	&mov	($rounds_,$rounds);	# backup $rounds
-+	&cmp	($len,0x60);
-+	&jb	(&label("ecb_dec_tail"));
-+
-+	&movdqu	($inout0,&QWP(0,$inp));
-+	&movdqu	($inout1,&QWP(0x10,$inp));
-+	&movdqu	($inout2,&QWP(0x20,$inp));
-+	&movdqu	($inout3,&QWP(0x30,$inp));
-+	&movdqu	($inout4,&QWP(0x40,$inp));
-+	&movdqu	($inout5,&QWP(0x50,$inp));
-+	&lea	($inp,&DWP(0x60,$inp));
-+	&sub	($len,0x60);
-+	&jmp	(&label("ecb_dec_loop6_enter"));
+-.Lecb_enc_tail:
+-	add	\$0x40,$len
++	call	_aesni_encrypt8
 +
-+&set_label("ecb_dec_loop6",16);
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movdqu	($inout0,&QWP(0,$inp));
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&movdqu	($inout1,&QWP(0x10,$inp));
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&movdqu	($inout2,&QWP(0x20,$inp));
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&movdqu	($inout3,&QWP(0x30,$inp));
-+	&movups	(&QWP(0x40,$out),$inout4);
-+	&movdqu	($inout4,&QWP(0x40,$inp));
-+	&movups	(&QWP(0x50,$out),$inout5);
-+	&lea	($out,&DWP(0x60,$out));
-+	&movdqu	($inout5,&QWP(0x50,$inp));
-+	&lea	($inp,&DWP(0x60,$inp));
-+&set_label("ecb_dec_loop6_enter");
++	sub	\$0x80,$len
++	jnc	.Lecb_enc_loop8
 +
-+	&call	("_aesni_decrypt6");
- 
--&set_label("ecb_dec_loop3",16);
--	&movups	($inout0,&QWP(0,$inp));
--	&movups	($inout1,&QWP(0x10,$inp));
--	&movups	($inout2,&QWP(0x20,$inp));
--	&call	("_aesni_decrypt3");
--	&sub	($len,0x30);
--	&lea	($inp,&DWP(0x30,$inp));
--	&lea	($out,&DWP(0x30,$out));
--	&movups	(&QWP(-0x30,$out),$inout0);
- 	&mov	($key,$key_);		# restore $key
--	&movups	(&QWP(-0x20,$out),$inout1);
- 	&mov	($rounds,$rounds_);	# restore $rounds
--	&movups	(&QWP(-0x10,$out),$inout2);
--	&ja	(&label("ecb_dec_loop3"));
-+	&sub	($len,0x60);
-+	&jnc	(&label("ecb_dec_loop6"));
- 
--&set_label("ecb_dec_tail");
--	&add	($len,0x40);
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&movups	(&QWP(0x40,$out),$inout4);
-+	&movups	(&QWP(0x50,$out),$inout5);
-+	&lea	($out,&DWP(0x60,$out));
-+	&add	($len,0x60);
- 	&jz	(&label("ecb_ret"));
- 
--	&cmp	($len,0x10);
-+&set_label("ecb_dec_tail");
- 	&movups	($inout0,&QWP(0,$inp));
--	&je	(&label("ecb_dec_one"));
- 	&cmp	($len,0x20);
-+	&jb	(&label("ecb_dec_one"));
- 	&movups	($inout1,&QWP(0x10,$inp));
- 	&je	(&label("ecb_dec_two"));
--	&cmp	($len,0x30);
- 	&movups	($inout2,&QWP(0x20,$inp));
--	&je	(&label("ecb_dec_three"));
-+	&cmp	($len,0x40);
-+	&jb	(&label("ecb_dec_three"));
- 	&movups	($inout3,&QWP(0x30,$inp));
--	&call	("_aesni_decrypt4");
-+	&je	(&label("ecb_dec_four"));
-+	&movups	($inout4,&QWP(0x40,$inp));
-+	&xorps	($inout5,$inout5);
-+	&call	("_aesni_decrypt6");
- 	&movups	(&QWP(0,$out),$inout0);
- 	&movups	(&QWP(0x10,$out),$inout1);
- 	&movups	(&QWP(0x20,$out),$inout2);
- 	&movups	(&QWP(0x30,$out),$inout3);
-+	&movups	(&QWP(0x40,$out),$inout4);
- 	&jmp	(&label("ecb_ret"));
- 
- &set_label("ecb_dec_one",16);
--	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt3");
-+	if ($inline)
-+	{   &aesni_inline_generate1("dec");	}
-+	else
-+	{   &call	("_aesni_decrypt1");	}
- 	&movups	(&QWP(0,$out),$inout0);
- 	&jmp	(&label("ecb_ret"));
++	movups	$inout0,($out)
++	mov	$key_,$key		# restore $key
++	movups	$inout1,0x10($out)
++	mov	$rnds_,$rounds		# restore $rounds
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
++	movups	$inout6,0x60($out)
++	movups	$inout7,0x70($out)
++	lea	0x80($out),$out
++	add	\$0x80,$len
+ 	jz	.Lecb_ret
  
- &set_label("ecb_dec_two",16);
-+	&xorps	($inout2,$inout2);
- 	&call	("_aesni_decrypt3");
- 	&movups	(&QWP(0,$out),$inout0);
- 	&movups	(&QWP(0x10,$out),$inout1);
-@@ -357,28 +553,42 @@ if ($PREFIX eq "aesni") {
- 	&movups	(&QWP(0,$out),$inout0);
- 	&movups	(&QWP(0x10,$out),$inout1);
- 	&movups	(&QWP(0x20,$out),$inout2);
-+	&jmp	(&label("ecb_ret"));
+-	cmp	\$0x10,$len
++.Lecb_enc_tail:
+ 	movups	($inp),$inout0
+-	je	.Lecb_enc_one
+ 	cmp	\$0x20,$len
++	jb	.Lecb_enc_one
+ 	movups	0x10($inp),$inout1
+ 	je	.Lecb_enc_two
+-	cmp	\$0x30,$len
+ 	movups	0x20($inp),$inout2
+-	je	.Lecb_enc_three
++	cmp	\$0x40,$len
++	jb	.Lecb_enc_three
+ 	movups	0x30($inp),$inout3
+-	call	_aesni_encrypt4
++	je	.Lecb_enc_four
++	movups	0x40($inp),$inout4
++	cmp	\$0x60,$len
++	jb	.Lecb_enc_five
++	movups	0x50($inp),$inout5
++	je	.Lecb_enc_six
++	movdqu	0x60($inp),$inout6
++	call	_aesni_encrypt8
+ 	movups	$inout0,($out)
+ 	movups	$inout1,0x10($out)
+ 	movups	$inout2,0x20($out)
+ 	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
++	movups	$inout6,0x60($out)
+ 	jmp	.Lecb_ret
+ .align	16
+ .Lecb_enc_one:
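The rewritten aesni_ecb_encrypt above now runs an 8-block main loop (with the stores of one batch interleaved with the loads of the next) and a tail that dispatches on the remaining 1-7 blocks. Stripped of the pipelining, the control flow is roughly the Perl reference model below; the $crypt8/$crypt1 callbacks are hypothetical stand-ins for the generated _aesni_encrypt8/_aesni_encrypt1 paths, and the real tail uses dedicated 2..7-block cases instead of looping:

    sub ecb_batches {
        my ($crypt8, $crypt1, $data) = @_;      # $data: whole 16-byte blocks only
        my $out = "";
        while (length($data) >= 8*16) {         # main loop: eight blocks per iteration
            $out .= $crypt8->(substr($data, 0, 8*16, ""));
        }
        while (length($data) >= 16) {           # tail: simplified to one block at a time
            $out .= $crypt1->(substr($data, 0, 16, ""));
        }
        return $out;
    }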
+@@ -280,6 +620,7 @@ $code.=<<___;
+ 	jmp	.Lecb_ret
+ .align	16
+ .Lecb_enc_two:
++	xorps	$inout2,$inout2
+ 	call	_aesni_encrypt3
+ 	movups	$inout0,($out)
+ 	movups	$inout1,0x10($out)
+@@ -291,47 +632,121 @@ $code.=<<___;
+ 	movups	$inout1,0x10($out)
+ 	movups	$inout2,0x20($out)
+ 	jmp	.Lecb_ret
++.align	16
++.Lecb_enc_four:
++	call	_aesni_encrypt4
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	jmp	.Lecb_ret
++.align	16
++.Lecb_enc_five:
++	xorps	$inout5,$inout5
++	call	_aesni_encrypt6
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	jmp	.Lecb_ret
++.align	16
++.Lecb_enc_six:
++	call	_aesni_encrypt6
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
++	jmp	.Lecb_ret
+ #--------------------------- ECB DECRYPT ------------------------------#
+ .align	16
+ .Lecb_decrypt:
+-	sub	\$0x40,$len
+-	jbe	.Lecb_dec_tail
+-	jmp	.Lecb_dec_loop3
++	cmp	\$0x80,$len
++	jb	.Lecb_dec_tail
 +
-+&set_label("ecb_dec_four",16);
-+	&call	("_aesni_decrypt4");
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&movups	(&QWP(0x30,$out),$inout3);
- 
- &set_label("ecb_ret");
- &function_end("aesni_ecb_encrypt");
- }
++	movdqu	($inp),$inout0
++	movdqu	0x10($inp),$inout1
++	movdqu	0x20($inp),$inout2
++	movdqu	0x30($inp),$inout3
++	movdqu	0x40($inp),$inout4
++	movdqu	0x50($inp),$inout5
++	movdqu	0x60($inp),$inout6
++	movdqu	0x70($inp),$inout7
++	lea	0x80($inp),$inp
++	sub	\$0x80,$len
++	jmp	.Lecb_dec_loop8_enter
+ .align 16
+-.Lecb_dec_loop3:
+-	movups	($inp),$inout0
+-	movups	0x10($inp),$inout1
+-	movups	0x20($inp),$inout2
+-	call	_aesni_decrypt3
+-	sub	\$0x30,$len
+-	lea	0x30($inp),$inp
+-	lea	0x30($out),$out
+-	movups	$inout0,-0x30($out)
+-	mov	$rnds_,$rounds		# restore $rounds
+-	movups	$inout1,-0x20($out)
++.Lecb_dec_loop8:
++	movups	$inout0,($out)
+ 	mov	$key_,$key		# restore $key
+-	movups	$inout2,-0x10($out)
+-	ja	.Lecb_dec_loop3
++	movdqu	($inp),$inout0
++	mov	$rnds_,$rounds		# restore $rounds
++	movups	$inout1,0x10($out)
++	movdqu	0x10($inp),$inout1
++	movups	$inout2,0x20($out)
++	movdqu	0x20($inp),$inout2
++	movups	$inout3,0x30($out)
++	movdqu	0x30($inp),$inout3
++	movups	$inout4,0x40($out)
++	movdqu	0x40($inp),$inout4
++	movups	$inout5,0x50($out)
++	movdqu	0x50($inp),$inout5
++	movups	$inout6,0x60($out)
++	movdqu	0x60($inp),$inout6
++	movups	$inout7,0x70($out)
++	lea	0x80($out),$out
++	movdqu	0x70($inp),$inout7
++	lea	0x80($inp),$inp
++.Lecb_dec_loop8_enter:
++
++	call	_aesni_decrypt8
++
++	$movkey	($key_),$rndkey0
++	sub	\$0x80,$len
++	jnc	.Lecb_dec_loop8
  
-+######################################################################
- # void $PREFIX_cbc_encrypt (const void *inp, void *out,
- #                           size_t length, const AES_KEY *key,
- #                           unsigned char *ivp,const int enc);
- &function_begin("${PREFIX}_cbc_encrypt");
- 	&mov	($inp,&wparam(0));
-+	&mov	($rounds_,"esp");
- 	&mov	($out,&wparam(1));
-+	&sub	($rounds_,24);
- 	&mov	($len,&wparam(2));
-+	&and	($rounds_,-16);
- 	&mov	($key,&wparam(3));
--	&test	($len,$len);
- 	&mov	($key_,&wparam(4));
--	&jz	(&label("cbc_ret"));
-+	&test	($len,$len);
-+	&jz	(&label("cbc_abort"));
+-.Lecb_dec_tail:
+-	add	\$0x40,$len
++	movups	$inout0,($out)
++	mov	$key_,$key		# restore $key
++	movups	$inout1,0x10($out)
++	mov	$rnds_,$rounds		# restore $rounds
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
++	movups	$inout6,0x60($out)
++	movups	$inout7,0x70($out)
++	lea	0x80($out),$out
++	add	\$0x80,$len
+ 	jz	.Lecb_ret
  
- 	&cmp	(&wparam(5),0);
--	&movups	($ivec,&QWP(0,$key_));	# load IV
-+	&xchg	($rounds_,"esp");		# alloca
-+	&movups	($ivec,&QWP(0,$key_));		# load IV
- 	&mov	($rounds,&DWP(240,$key));
--	&mov	($key_,$key);		# backup $key
--	&mov	($rounds_,$rounds);	# backup $rounds
-+	&mov	($key_,$key);			# backup $key
-+	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
-+	&mov	($rounds_,$rounds);		# backup $rounds
- 	&je	(&label("cbc_decrypt"));
+-	cmp	\$0x10,$len
++.Lecb_dec_tail:
+ 	movups	($inp),$inout0
+-	je	.Lecb_dec_one
+ 	cmp	\$0x20,$len
++	jb	.Lecb_dec_one
+ 	movups	0x10($inp),$inout1
+ 	je	.Lecb_dec_two
+-	cmp	\$0x30,$len
+ 	movups	0x20($inp),$inout2
+-	je	.Lecb_dec_three
++	cmp	\$0x40,$len
++	jb	.Lecb_dec_three
+ 	movups	0x30($inp),$inout3
+-	call	_aesni_decrypt4
++	je	.Lecb_dec_four
++	movups	0x40($inp),$inout4
++	cmp	\$0x60,$len
++	jb	.Lecb_dec_five
++	movups	0x50($inp),$inout5
++	je	.Lecb_dec_six
++	movups	0x60($inp),$inout6
++	$movkey	($key),$rndkey0
++	call	_aesni_decrypt8
+ 	movups	$inout0,($out)
+ 	movups	$inout1,0x10($out)
+ 	movups	$inout2,0x20($out)
+ 	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
++	movups	$inout6,0x60($out)
+ 	jmp	.Lecb_ret
+ .align	16
+ .Lecb_dec_one:
+@@ -342,6 +757,7 @@ $code.=<<___;
+ 	jmp	.Lecb_ret
+ .align	16
+ .Lecb_dec_two:
++	xorps	$inout2,$inout2
+ 	call	_aesni_decrypt3
+ 	movups	$inout0,($out)
+ 	movups	$inout1,0x10($out)
+@@ -352,6 +768,34 @@ $code.=<<___;
+ 	movups	$inout0,($out)
+ 	movups	$inout1,0x10($out)
+ 	movups	$inout2,0x20($out)
++	jmp	.Lecb_ret
++.align	16
++.Lecb_dec_four:
++	call	_aesni_decrypt4
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	jmp	.Lecb_ret
++.align	16
++.Lecb_dec_five:
++	xorps	$inout5,$inout5
++	call	_aesni_decrypt6
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	jmp	.Lecb_ret
++.align	16
++.Lecb_dec_six:
++	call	_aesni_decrypt6
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
  
- 	&movaps	($inout0,$ivec);
-@@ -388,15 +598,17 @@ if ($PREFIX eq "aesni") {
- 	&jmp	(&label("cbc_enc_loop"));
+ .Lecb_ret:
+ 	ret
+@@ -362,7 +806,8 @@ ___
+ # void $PREFIX_cbc_encrypt (const void *inp, void *out,
+ #			    size_t length, const AES_KEY *key,
+ #			    unsigned char *ivp,const int enc);
+-$reserved = $win64?0x40:-0x18;	# used in decrypt
++{
++my $reserved = $win64?0x40:-0x18;	# used in decrypt
+ $code.=<<___;
+ .globl	${PREFIX}_cbc_encrypt
+ .type	${PREFIX}_cbc_encrypt,\@function,6
+@@ -371,30 +816,30 @@ ${PREFIX}_cbc_encrypt:
+ 	test	$len,$len		# check length
+ 	jz	.Lcbc_ret
  
- &set_label("cbc_enc_loop",16);
--	&movups	($ivec,&QWP(0,$inp));
-+	&movups	($ivec,&QWP(0,$inp));		# input actually
- 	&lea	($inp,&DWP(16,$inp));
--	&pxor	($inout0,$ivec);
--	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt3");
--	&sub	($len,16);
--	&lea	($out,&DWP(16,$out));
-+	if ($inline)
-+	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
-+	else
-+	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
- 	&mov	($rounds,$rounds_);	# restore $rounds
- 	&mov	($key,$key_);		# restore $key
--	&movups	(&QWP(-16,$out),$inout0);
-+	&movups	(&QWP(0,$out),$inout0);	# store output
-+	&lea	($out,&DWP(16,$out));
-+	&sub	($len,16);
- 	&jnc	(&label("cbc_enc_loop"));
- 	&add	($len,16);
- 	&jnz	(&label("cbc_enc_tail"));
-@@ -415,90 +627,151 @@ if ($PREFIX eq "aesni") {
- 	&mov	($inp,$out);		# $inp and $out are the same
- 	&mov	($key,$key_);		# restore $key
- 	&jmp	(&label("cbc_enc_loop"));
--
-+######################################################################
- &set_label("cbc_decrypt",16);
--	&sub	($len,0x40);
-+	&cmp	($len,0x50);
- 	&jbe	(&label("cbc_dec_tail"));
--	&jmp	(&label("cbc_dec_loop3"));
-+	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
-+	&sub	($len,0x50);
-+	&jmp	(&label("cbc_dec_loop6_enter"));
-+
-+&set_label("cbc_dec_loop6",16);
-+	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
-+	&movups	(&QWP(0,$out),$inout5);
-+	&lea	($out,&DWP(0x10,$out));
-+&set_label("cbc_dec_loop6_enter");
-+	&movdqu	($inout0,&QWP(0,$inp));
-+	&movdqu	($inout1,&QWP(0x10,$inp));
-+	&movdqu	($inout2,&QWP(0x20,$inp));
-+	&movdqu	($inout3,&QWP(0x30,$inp));
-+	&movdqu	($inout4,&QWP(0x40,$inp));
-+	&movdqu	($inout5,&QWP(0x50,$inp));
+-	mov	240($key),$rnds_	# pull $rounds
++	mov	240($key),$rnds_	# key->rounds
+ 	mov	$key,$key_		# backup $key
+ 	test	%r9d,%r9d		# 6th argument
+ 	jz	.Lcbc_decrypt
+ #--------------------------- CBC ENCRYPT ------------------------------#
+ 	movups	($ivp),$inout0		# load iv as initial state
+-	cmp	\$16,$len
+ 	mov	$rnds_,$rounds
++	cmp	\$16,$len
+ 	jb	.Lcbc_enc_tail
+ 	sub	\$16,$len
+ 	jmp	.Lcbc_enc_loop
+-.align 16
++.align	16
+ .Lcbc_enc_loop:
+ 	movups	($inp),$inout1		# load input
+ 	lea	16($inp),$inp
+-	pxor	$inout1,$inout0
++	#xorps	$inout1,$inout0
+ ___
+-	&aesni_generate1("enc",$key,$rounds);
++	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+ $code.=<<___;
+-	sub	\$16,$len
+-	lea	16($out),$out
+ 	mov	$rnds_,$rounds		# restore $rounds
+ 	mov	$key_,$key		# restore $key
+-	movups	$inout0,-16($out)	# store output
++	movups	$inout0,0($out)		# store output
++	lea	16($out),$out
++	sub	\$16,$len
+ 	jnc	.Lcbc_enc_loop
+ 	add	\$16,$len
+ 	jnz	.Lcbc_enc_tail
+@@ -429,92 +874,238 @@ $code.=<<___ if ($win64);
+ ___
+ $code.=<<___;
+ 	movups	($ivp),$iv
+-	sub	\$0x40,$len
+ 	mov	$rnds_,$rounds
++	cmp	\$0x70,$len
+ 	jbe	.Lcbc_dec_tail
+-	jmp	.Lcbc_dec_loop3
+-.align 16
+-.Lcbc_dec_loop3:
+-	movups	($inp),$inout0
++	shr	\$1,$rnds_
++	sub	\$0x70,$len
++	mov	$rnds_,$rounds
++	movaps	$iv,$reserved(%rsp)
++	jmp	.Lcbc_dec_loop8_enter
++.align	16
++.Lcbc_dec_loop8:
++	movaps	$rndkey0,$reserved(%rsp)	# save IV
++	movups	$inout7,($out)
++	lea	0x10($out),$out
++.Lcbc_dec_loop8_enter:
++	$movkey		($key),$rndkey0
++	movups	($inp),$inout0			# load input
+ 	movups	0x10($inp),$inout1
+-	movups	0x20($inp),$inout2
+-	movaps	$inout0,$in0
+-	movaps	$inout1,$in1
+-	movaps	$inout2,$in2
+-	call	_aesni_decrypt3
+-	sub	\$0x30,$len
+-	lea	0x30($inp),$inp
+-	lea	0x30($out),$out
+-	pxor	$iv,$inout0
+-	pxor	$in0,$inout1
+-	movaps	$in2,$iv
+-	pxor	$in1,$inout2
+-	movups	$inout0,-0x30($out)
+-	mov	$rnds_,$rounds	# restore $rounds
+-	movups	$inout1,-0x20($out)
+-	mov	$key_,$key	# restore $key
+-	movups	$inout2,-0x10($out)
+-	ja	.Lcbc_dec_loop3
++	$movkey		16($key),$rndkey1
  
--&set_label("cbc_dec_loop3",16);
--	&movups	($inout0,&QWP(0,$inp));
--	&movups	($inout1,&QWP(0x10,$inp));
--	&movups	($inout2,&QWP(0x20,$inp));
--	&movaps	($in0,$inout0);
--	&movaps	($in1,$inout1);
--	&call	("_aesni_decrypt3");
--	&sub	($len,0x30);
--	&lea	($inp,&DWP(0x30,$inp));
--	&lea	($out,&DWP(0x30,$out));
--	&pxor	($inout0,$ivec);
--	&pxor	($inout1,$in0);
--	&movups	($ivec,&QWP(-0x10,$inp));
--	&pxor	($inout2,$in1);
--	&movups	(&QWP(-0x30,$out),$inout0);
--	&mov	($rounds,$rounds_)	# restore $rounds
--	&movups	(&QWP(-0x20,$out),$inout1);
--	&mov	($key,$key_);		# restore $key
--	&movups	(&QWP(-0x10,$out),$inout2);
--	&ja	(&label("cbc_dec_loop3"));
-+	&call	("_aesni_decrypt6");
+-.Lcbc_dec_tail:
+-	add	\$0x40,$len
+-	movups	$iv,($ivp)
+-	jz	.Lcbc_dec_ret
++	lea		32($key),$key
++	movdqu	0x20($inp),$inout2
++	xorps		$rndkey0,$inout0
++	movdqu	0x30($inp),$inout3
++	xorps		$rndkey0,$inout1
++	movdqu	0x40($inp),$inout4
++	aesdec		$rndkey1,$inout0
++	pxor		$rndkey0,$inout2
++	movdqu	0x50($inp),$inout5
++	aesdec		$rndkey1,$inout1
++	pxor		$rndkey0,$inout3
++	movdqu	0x60($inp),$inout6
++	aesdec		$rndkey1,$inout2
++	pxor		$rndkey0,$inout4
++	movdqu	0x70($inp),$inout7
++	aesdec		$rndkey1,$inout3
++	pxor		$rndkey0,$inout5
++	dec		$rounds
++	aesdec		$rndkey1,$inout4
++	pxor		$rndkey0,$inout6
++	aesdec		$rndkey1,$inout5
++	pxor		$rndkey0,$inout7
++	$movkey		($key),$rndkey0
++	aesdec		$rndkey1,$inout6
++	aesdec		$rndkey1,$inout7
++	$movkey		16($key),$rndkey1
++
++	call		.Ldec_loop8_enter
  
-+	&movups	($rndkey1,&QWP(0,$inp));
-+	&movups	($rndkey0,&QWP(0x10,$inp));
-+	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
-+	&xorps	($inout1,$rndkey1);
-+	&movups	($rndkey1,&QWP(0x20,$inp));
-+	&xorps	($inout2,$rndkey0);
-+	&movups	($rndkey0,&QWP(0x30,$inp));
-+	&xorps	($inout3,$rndkey1);
-+	&movups	($rndkey1,&QWP(0x40,$inp));
-+	&xorps	($inout4,$rndkey0);
-+	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
-+	&xorps	($inout5,$rndkey1);
-+	&movups	(&QWP(0,$out),$inout0);
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&lea	($inp,&DWP(0x60,$inp));
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&mov	($rounds,$rounds_)		# restore $rounds
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&mov	($key,$key_);			# restore $key
-+	&movups	(&QWP(0x40,$out),$inout4);
-+	&lea	($out,&DWP(0x50,$out));
-+	&sub	($len,0x60);
-+	&ja	(&label("cbc_dec_loop6"));
++	movups	($inp),$rndkey1		# re-load input
++	movups	0x10($inp),$rndkey0
++	xorps	$reserved(%rsp),$inout0	# ^= IV
++	xorps	$rndkey1,$inout1
++	movups	0x20($inp),$rndkey1
++	xorps	$rndkey0,$inout2
++	movups	0x30($inp),$rndkey0
++	xorps	$rndkey1,$inout3
++	movups	0x40($inp),$rndkey1
++	xorps	$rndkey0,$inout4
++	movups	0x50($inp),$rndkey0
++	xorps	$rndkey1,$inout5
++	movups	0x60($inp),$rndkey1
++	xorps	$rndkey0,$inout6
++	movups	0x70($inp),$rndkey0	# IV
++	xorps	$rndkey1,$inout7
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	mov	$rnds_,$rounds		# restore $rounds
++	movups	$inout4,0x40($out)
++	mov	$key_,$key		# restore $key
++	movups	$inout5,0x50($out)
++	lea	0x80($inp),$inp
++	movups	$inout6,0x60($out)
++	lea	0x70($out),$out
++	sub	\$0x80,$len
++	ja	.Lcbc_dec_loop8
++
++	movaps	$inout7,$inout0
++	movaps	$rndkey0,$iv
++	add	\$0x70,$len
++	jle	.Lcbc_dec_tail_collected
++	movups	$inout0,($out)
++	lea	1($rnds_,$rnds_),$rounds
++	lea	0x10($out),$out
++.Lcbc_dec_tail:
+ 	movups	($inp),$inout0
+-	cmp	\$0x10,$len
+ 	movaps	$inout0,$in0
++	cmp	\$0x10,$len
+ 	jbe	.Lcbc_dec_one
 +
-+	&movaps	($inout0,$inout5);
-+	&movaps	($ivec,$rndkey0);
-+	&add	($len,0x50);
-+	&jle	(&label("cbc_dec_tail_collected"));
-+	&movups	(&QWP(0,$out),$inout0);
-+	&lea	($out,&DWP(0x10,$out));
- &set_label("cbc_dec_tail");
--	&add	($len,0x40);
--	&jz	(&label("cbc_ret"));
--
- 	&movups	($inout0,&QWP(0,$inp));
--	&cmp	($len,0x10);
- 	&movaps	($in0,$inout0);
-+	&cmp	($len,0x10);
- 	&jbe	(&label("cbc_dec_one"));
+ 	movups	0x10($inp),$inout1
+-	cmp	\$0x20,$len
+ 	movaps	$inout1,$in1
++	cmp	\$0x20,$len
+ 	jbe	.Lcbc_dec_two
 +
- 	&movups	($inout1,&QWP(0x10,$inp));
--	&cmp	($len,0x20);
- 	&movaps	($in1,$inout1);
-+	&cmp	($len,0x20);
- 	&jbe	(&label("cbc_dec_two"));
+ 	movups	0x20($inp),$inout2
+-	cmp	\$0x30,$len
+ 	movaps	$inout2,$in2
++	cmp	\$0x30,$len
+ 	jbe	.Lcbc_dec_three
 +
- 	&movups	($inout2,&QWP(0x20,$inp));
- 	&cmp	($len,0x30);
- 	&jbe	(&label("cbc_dec_three"));
+ 	movups	0x30($inp),$inout3
+-	call	_aesni_decrypt4
+-	pxor	$iv,$inout0
+-	movups	0x30($inp),$iv
+-	pxor	$in0,$inout1
++	cmp	\$0x40,$len
++	jbe	.Lcbc_dec_four
 +
- 	&movups	($inout3,&QWP(0x30,$inp));
--	&call	("_aesni_decrypt4");
-+	&cmp	($len,0x40);
-+	&jbe	(&label("cbc_dec_four"));
++	movups	0x40($inp),$inout4
++	cmp	\$0x50,$len
++	jbe	.Lcbc_dec_five
 +
-+	&movups	($inout4,&QWP(0x40,$inp));
-+	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
-+	&movups	($inout0,&QWP(0,$inp));
-+	&xorps	($inout5,$inout5);
-+	&call	("_aesni_decrypt6");
-+	&movups	($rndkey1,&QWP(0,$inp));
- 	&movups	($rndkey0,&QWP(0x10,$inp));
-+	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
-+	&xorps	($inout1,$rndkey1);
- 	&movups	($rndkey1,&QWP(0x20,$inp));
--	&pxor	($inout0,$ivec);
--	&pxor	($inout1,$in0);
--	&movups	($ivec,&QWP(0x30,$inp));
-+	&xorps	($inout2,$rndkey0);
-+	&movups	($rndkey0,&QWP(0x30,$inp));
-+	&xorps	($inout3,$rndkey1);
-+	&movups	($ivec,&QWP(0x40,$inp));	# IV
-+	&xorps	($inout4,$rndkey0);
- 	&movups	(&QWP(0,$out),$inout0);
--	&pxor	($inout2,$rndkey0);
--	&pxor	($inout3,$rndkey1);
- 	&movups	(&QWP(0x10,$out),$inout1);
- 	&movups	(&QWP(0x20,$out),$inout2);
--	&movaps	($inout0,$inout3);
--	&lea	($out,&DWP(0x30,$out));
-+	&movups	(&QWP(0x30,$out),$inout3);
-+	&lea	($out,&DWP(0x40,$out));
-+	&movaps	($inout0,$inout4);
-+	&sub	($len,0x50);
- 	&jmp	(&label("cbc_dec_tail_collected"));
- 
--&set_label("cbc_dec_one");
--	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt3");
--	&pxor	($inout0,$ivec);
-+&set_label("cbc_dec_one",16);
-+	if ($inline)
-+	{   &aesni_inline_generate1("dec");	}
-+	else
-+	{   &call	("_aesni_decrypt1");	}
-+	&xorps	($inout0,$ivec);
- 	&movaps	($ivec,$in0);
-+	&sub	($len,0x10);
- 	&jmp	(&label("cbc_dec_tail_collected"));
- 
--&set_label("cbc_dec_two");
-+&set_label("cbc_dec_two",16);
-+	&xorps	($inout2,$inout2);
- 	&call	("_aesni_decrypt3");
--	&pxor	($inout0,$ivec);
--	&pxor	($inout1,$in0);
-+	&xorps	($inout0,$ivec);
-+	&xorps	($inout1,$in0);
- 	&movups	(&QWP(0,$out),$inout0);
- 	&movaps	($inout0,$inout1);
--	&movaps	($ivec,$in1);
- 	&lea	($out,&DWP(0x10,$out));
-+	&movaps	($ivec,$in1);
-+	&sub	($len,0x20);
- 	&jmp	(&label("cbc_dec_tail_collected"));
- 
--&set_label("cbc_dec_three");
-+&set_label("cbc_dec_three",16);
- 	&call	("_aesni_decrypt3");
--	&pxor	($inout0,$ivec);
--	&pxor	($inout1,$in0);
--	&pxor	($inout2,$in1);
-+	&xorps	($inout0,$ivec);
-+	&xorps	($inout1,$in0);
-+	&xorps	($inout2,$in1);
- 	&movups	(&QWP(0,$out),$inout0);
--	&movups	(&QWP(0x10,$out),$inout1);
- 	&movaps	($inout0,$inout2);
--	&movups	($ivec,&QWP(0x20,$inp));
-+	&movups	(&QWP(0x10,$out),$inout1);
- 	&lea	($out,&DWP(0x20,$out));
-+	&movups	($ivec,&QWP(0x20,$inp));
-+	&sub	($len,0x30);
-+	&jmp	(&label("cbc_dec_tail_collected"));
++	movups	0x50($inp),$inout5
++	cmp	\$0x60,$len
++	jbe	.Lcbc_dec_six
 +
-+&set_label("cbc_dec_four",16);
-+	&call	("_aesni_decrypt4");
-+	&movups	($rndkey1,&QWP(0x10,$inp));
-+	&movups	($rndkey0,&QWP(0x20,$inp));
-+	&xorps	($inout0,$ivec);
-+	&movups	($ivec,&QWP(0x30,$inp));
-+	&xorps	($inout1,$in0);
-+	&movups	(&QWP(0,$out),$inout0);
-+	&xorps	($inout2,$rndkey1);
-+	&movups	(&QWP(0x10,$out),$inout1);
-+	&xorps	($inout3,$rndkey0);
-+	&movups	(&QWP(0x20,$out),$inout2);
-+	&lea	($out,&DWP(0x30,$out));
-+	&movaps	($inout0,$inout3);
-+	&sub	($len,0x40);
- 
- &set_label("cbc_dec_tail_collected");
- 	&and	($len,15);
-@@ -506,21 +779,21 @@ if ($PREFIX eq "aesni") {
- 	&movups	(&QWP(0,$out),$inout0);
- 	&jmp	(&label("cbc_ret"));
- 
--&set_label("cbc_dec_tail_partial");
--	&mov	($key_,"esp");
--	&sub	("esp",16);
--	&and	("esp",-16);
-+&set_label("cbc_dec_tail_partial",16);
- 	&movaps	(&QWP(0,"esp"),$inout0);
-+	&mov	("ecx",16);
- 	&mov	($inp,"esp");
--	&mov	("ecx",$len);
-+	&sub	("ecx",$len);
- 	&data_word(0xA4F3F689);		# rep movsb
--	&mov	("esp",$key_);
- 
- &set_label("cbc_ret");
-+	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
- 	&mov	($key_,&wparam(4));
- 	&movups	(&QWP(0,$key_),$ivec);	# output IV
-+&set_label("cbc_abort");
- &function_end("${PREFIX}_cbc_encrypt");
--
-+
-+######################################################################
- # Mechanical port from aesni-x86_64.pl.
- #
- # _aesni_set_encrypt_key is private interface,
-@@ -539,7 +812,7 @@ if ($PREFIX eq "aesni") {
- 	&jz	(&label("bad_pointer"));
++	movups	0x60($inp),$inout6
++	movaps	$iv,$reserved(%rsp)	# save IV
++	call	_aesni_decrypt8
++	movups	($inp),$rndkey1
++	movups	0x10($inp),$rndkey0
++	xorps	$reserved(%rsp),$inout0	# ^= IV
++	xorps	$rndkey1,$inout1
++	movups	0x20($inp),$rndkey1
++	xorps	$rndkey0,$inout2
++	movups	0x30($inp),$rndkey0
++	xorps	$rndkey1,$inout3
++	movups	0x40($inp),$rndkey1
++	xorps	$rndkey0,$inout4
++	movups	0x50($inp),$rndkey0
++	xorps	$rndkey1,$inout5
++	movups	0x60($inp),$iv		# IV
++	xorps	$rndkey0,$inout6
+ 	movups	$inout0,($out)
+-	pxor	$in1,$inout2
+ 	movups	$inout1,0x10($out)
+-	pxor	$in2,$inout3
+ 	movups	$inout2,0x20($out)
+-	movaps	$inout3,$inout0
+-	lea	0x30($out),$out
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	movups	$inout5,0x50($out)
++	lea	0x60($out),$out
++	movaps	$inout6,$inout0
++	sub	\$0x70,$len
+ 	jmp	.Lcbc_dec_tail_collected
+ .align	16
+ .Lcbc_dec_one:
+ ___
+ 	&aesni_generate1("dec",$key,$rounds);
+ $code.=<<___;
+-	pxor	$iv,$inout0
++	xorps	$iv,$inout0
+ 	movaps	$in0,$iv
++	sub	\$0x10,$len
+ 	jmp	.Lcbc_dec_tail_collected
+ .align	16
+ .Lcbc_dec_two:
++	xorps	$inout2,$inout2
+ 	call	_aesni_decrypt3
+-	pxor	$iv,$inout0
+-	pxor	$in0,$inout1
++	xorps	$iv,$inout0
++	xorps	$in0,$inout1
+ 	movups	$inout0,($out)
+ 	movaps	$in1,$iv
+ 	movaps	$inout1,$inout0
+ 	lea	0x10($out),$out
++	sub	\$0x20,$len
+ 	jmp	.Lcbc_dec_tail_collected
+ .align	16
+ .Lcbc_dec_three:
+ 	call	_aesni_decrypt3
+-	pxor	$iv,$inout0
+-	pxor	$in0,$inout1
++	xorps	$iv,$inout0
++	xorps	$in0,$inout1
+ 	movups	$inout0,($out)
+-	pxor	$in1,$inout2
++	xorps	$in1,$inout2
+ 	movups	$inout1,0x10($out)
+ 	movaps	$in2,$iv
+ 	movaps	$inout2,$inout0
+ 	lea	0x20($out),$out
++	sub	\$0x30,$len
++	jmp	.Lcbc_dec_tail_collected
++.align	16
++.Lcbc_dec_four:
++	call	_aesni_decrypt4
++	xorps	$iv,$inout0
++	movups	0x30($inp),$iv
++	xorps	$in0,$inout1
++	movups	$inout0,($out)
++	xorps	$in1,$inout2
++	movups	$inout1,0x10($out)
++	xorps	$in2,$inout3
++	movups	$inout2,0x20($out)
++	movaps	$inout3,$inout0
++	lea	0x30($out),$out
++	sub	\$0x40,$len
++	jmp	.Lcbc_dec_tail_collected
++.align	16
++.Lcbc_dec_five:
++	xorps	$inout5,$inout5
++	call	_aesni_decrypt6
++	movups	0x10($inp),$rndkey1
++	movups	0x20($inp),$rndkey0
++	xorps	$iv,$inout0
++	xorps	$in0,$inout1
++	xorps	$rndkey1,$inout2
++	movups	0x30($inp),$rndkey1
++	xorps	$rndkey0,$inout3
++	movups	0x40($inp),$iv
++	xorps	$rndkey1,$inout4
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	lea	0x40($out),$out
++	movaps	$inout4,$inout0
++	sub	\$0x50,$len
++	jmp	.Lcbc_dec_tail_collected
++.align	16
++.Lcbc_dec_six:
++	call	_aesni_decrypt6
++	movups	0x10($inp),$rndkey1
++	movups	0x20($inp),$rndkey0
++	xorps	$iv,$inout0
++	xorps	$in0,$inout1
++	xorps	$rndkey1,$inout2
++	movups	0x30($inp),$rndkey1
++	xorps	$rndkey0,$inout3
++	movups	0x40($inp),$rndkey0
++	xorps	$rndkey1,$inout4
++	movups	0x50($inp),$iv
++	xorps	$rndkey0,$inout5
++	movups	$inout0,($out)
++	movups	$inout1,0x10($out)
++	movups	$inout2,0x20($out)
++	movups	$inout3,0x30($out)
++	movups	$inout4,0x40($out)
++	lea	0x50($out),$out
++	movaps	$inout5,$inout0
++	sub	\$0x60,$len
+ 	jmp	.Lcbc_dec_tail_collected
+ .align	16
+ .Lcbc_dec_tail_collected:
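For contrast with the strictly serial CBC-encrypt loop earlier in this hunk (each block must be XORed with the previous ciphertext before it can be encrypted), CBC decryption is parallel: every plaintext is just Decrypt(C[i]) XOR C[i-1], which is why the code above can keep eight blocks in flight and only park the chaining value in $reserved(%rsp) across the _aesni_decrypt8 call. A hedged reference model, with $dec a hypothetical single-block decrypt callback and blocks as 16-byte strings:

    sub cbc_decrypt_chunk {
        my ($dec, $iv, @ct) = @_;
        my @x = map { $dec->($_) } @ct;         # independent work - this is what gets interleaved
        my ($prev, @pt) = ($iv);
        for my $i (0 .. $#ct) {
            $pt[$i] = $x[$i] ^ $prev;           # bitwise xor of 16-byte strings
            $prev   = $ct[$i];
        }
        return (\@pt, $prev);                   # $prev is the IV for the next chunk
    }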
+@@ -523,10 +1114,12 @@ $code.=<<___;
+ 	jnz	.Lcbc_dec_tail_partial
+ 	movups	$inout0,($out)
+ 	jmp	.Lcbc_dec_ret
++.align	16
+ .Lcbc_dec_tail_partial:
+ 	movaps	$inout0,$reserved(%rsp)
++	mov	\$16,%rcx
+ 	mov	$out,%rdi
+-	mov	$len,%rcx
++	sub	$len,%rcx
+ 	lea	$reserved(%rsp),%rsi
+ 	.long	0x9066A4F3	# rep movsb
  
- 	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
--	&pxor	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
-+	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
- 	&lea	($key,&DWP(16,$key));
- 	&cmp	($rounds,256);
- 	&je	(&label("14rounds"));
-@@ -581,11 +854,11 @@ if ($PREFIX eq "aesni") {
- 	&lea		($key,&DWP(16,$key));
- &set_label("key_128_cold");
- 	&shufps		("xmm4","xmm0",0b00010000);
--	&pxor		("xmm0","xmm4");
--	&shufps		("xmm4","xmm0",0b10001100,);
--	&pxor		("xmm0","xmm4");
--	&pshufd		("xmm1","xmm1",0b11111111);	# critical path
--	&pxor		("xmm0","xmm1");
-+	&xorps		("xmm0","xmm4");
-+	&shufps		("xmm4","xmm0",0b10001100);
-+	&xorps		("xmm0","xmm4");
-+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
-+	&xorps		("xmm0","xmm1");
- 	&ret();
+@@ -544,7 +1137,7 @@ $code.=<<___;
+ 	ret
+ .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+ ___
+-
++} 
+ # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+ #				int bits, AES_KEY *key)
+ { my ($inp,$bits,$key) = @_4args;
+@@ -556,7 +1149,7 @@ $code.=<<___;
+ .align	16
+ ${PREFIX}_set_decrypt_key:
+ 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+-	call	_aesni_set_encrypt_key
++	call	__aesni_set_encrypt_key
+ 	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
+ 	test	%eax,%eax
+ 	jnz	.Ldec_key_ret
+@@ -576,9 +1169,9 @@ ${PREFIX}_set_decrypt_key:
+ 	aesimc	%xmm1,%xmm1
+ 	lea	16($key),$key
+ 	lea	-16($inp),$inp
+-	cmp	$key,$inp
+ 	$movkey	%xmm0,16($inp)
+ 	$movkey	%xmm1,-16($key)
++	cmp	$key,$inp
+ 	ja	.Ldec_key_inverse
  
- &set_label("12rounds",16);
-@@ -620,11 +893,11 @@ if ($PREFIX eq "aesni") {
- 	&movaps		("xmm5","xmm2");
- &set_label("key_192b_warm");
- 	&shufps		("xmm4","xmm0",0b00010000);
--	&movaps		("xmm3","xmm2");
--	&pxor		("xmm0","xmm4");
-+	&movdqa		("xmm3","xmm2");
-+	&xorps		("xmm0","xmm4");
- 	&shufps		("xmm4","xmm0",0b10001100);
- 	&pslldq		("xmm3",4);
--	&pxor		("xmm0","xmm4");
-+	&xorps		("xmm0","xmm4");
- 	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
- 	&pxor		("xmm2","xmm3");
- 	&pxor		("xmm0","xmm1");
-@@ -683,11 +956,11 @@ if ($PREFIX eq "aesni") {
- 	&lea		($key,&DWP(16,$key));
- &set_label("key_256a_cold");
- 	&shufps		("xmm4","xmm0",0b00010000);
--	&pxor		("xmm0","xmm4");
-+	&xorps		("xmm0","xmm4");
- 	&shufps		("xmm4","xmm0",0b10001100);
--	&pxor		("xmm0","xmm4");
--	&pshufd		("xmm1","xmm1",0b11111111);	# critical path
--	&pxor		("xmm0","xmm1");
-+	&xorps		("xmm0","xmm4");
-+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
-+	&xorps		("xmm0","xmm1");
- 	&ret();
+ 	$movkey	($key),%xmm0		# inverse middle
+@@ -605,16 +1198,16 @@ $code.=<<___;
+ .type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
+ .align	16
+ ${PREFIX}_set_encrypt_key:
+-_aesni_set_encrypt_key:
++__aesni_set_encrypt_key:
+ 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+-	test	$inp,$inp
+ 	mov	\$-1,%rax
++	test	$inp,$inp
+ 	jz	.Lenc_key_ret
+ 	test	$key,$key
+ 	jz	.Lenc_key_ret
  
- &set_label("key_256b",16);
-@@ -695,11 +968,11 @@ if ($PREFIX eq "aesni") {
- 	&lea		($key,&DWP(16,$key));
+ 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
+-	pxor	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
++	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
+ 	lea	16($key),%rax
+ 	cmp	\$256,$bits
+ 	je	.L14rounds
+@@ -729,11 +1322,11 @@ _aesni_set_encrypt_key:
+ 	lea	16(%rax),%rax
+ .Lkey_expansion_128_cold:
+ 	shufps	\$0b00010000,%xmm0,%xmm4
+-	pxor	%xmm4, %xmm0
++	xorps	%xmm4, %xmm0
+ 	shufps	\$0b10001100,%xmm0,%xmm4
+-	pxor	%xmm4, %xmm0
+-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
+-	pxor	%xmm1,%xmm0
++	xorps	%xmm4, %xmm0
++	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
++	xorps	%xmm1,%xmm0
+ 	ret
  
- 	&shufps		("xmm4","xmm2",0b00010000);
--	&pxor		("xmm2","xmm4");
-+	&xorps		("xmm2","xmm4");
- 	&shufps		("xmm4","xmm2",0b10001100);
--	&pxor		("xmm2","xmm4");
--	&pshufd		("xmm1","xmm1",0b10101010);	# critical path
--	&pxor		("xmm2","xmm1");
-+	&xorps		("xmm2","xmm4");
-+	&shufps		("xmm1","xmm1",0b10101010);	# critical path
-+	&xorps		("xmm2","xmm1");
- 	&ret();
+ .align 16
+@@ -744,11 +1337,11 @@ _aesni_set_encrypt_key:
+ 	movaps	%xmm2, %xmm5
+ .Lkey_expansion_192b_warm:
+ 	shufps	\$0b00010000,%xmm0,%xmm4
+-	movaps	%xmm2,%xmm3
+-	pxor	%xmm4,%xmm0
++	movdqa	%xmm2,%xmm3
++	xorps	%xmm4,%xmm0
+ 	shufps	\$0b10001100,%xmm0,%xmm4
+ 	pslldq	\$4,%xmm3
+-	pxor	%xmm4,%xmm0
++	xorps	%xmm4,%xmm0
+ 	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
+ 	pxor	%xmm3,%xmm2
+ 	pxor	%xmm1,%xmm0
+@@ -772,11 +1365,11 @@ _aesni_set_encrypt_key:
+ 	lea	16(%rax),%rax
+ .Lkey_expansion_256a_cold:
+ 	shufps	\$0b00010000,%xmm0,%xmm4
+-	pxor	%xmm4,%xmm0
++	xorps	%xmm4,%xmm0
+ 	shufps	\$0b10001100,%xmm0,%xmm4
+-	pxor	%xmm4,%xmm0
+-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
+-	pxor	%xmm1,%xmm0
++	xorps	%xmm4,%xmm0
++	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
++	xorps	%xmm1,%xmm0
+ 	ret
  
- &set_label("bad_pointer",4);
-@@ -747,9 +1020,9 @@ if ($PREFIX eq "aesni") {
- 	&aesimc		("xmm1","xmm1");
- 	&lea		($key,&DWP(16,$key));
- 	&lea		("eax",&DWP(-16,"eax"));
--	&cmp		("eax",$key);
- 	&$movekey	(&QWP(16,"eax"),"xmm0");
- 	&$movekey	(&QWP(-16,$key),"xmm1");
-+	&cmp		("eax",$key);
- 	&ja		(&label("dec_key_inverse"));
+ .align 16
+@@ -785,17 +1378,28 @@ _aesni_set_encrypt_key:
+ 	lea	16(%rax),%rax
  
- 	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
-diff -up openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl.intelopts openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl
---- openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl.intelopts	2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl	2011-08-24 12:50:56.000000000 +0200
+ 	shufps	\$0b00010000,%xmm2,%xmm4
+-	pxor	%xmm4,%xmm2
++	xorps	%xmm4,%xmm2
+ 	shufps	\$0b10001100,%xmm2,%xmm4
+-	pxor	%xmm4,%xmm2
+-	pshufd	\$0b10101010,%xmm1,%xmm1	# critical path
+-	pxor	%xmm1,%xmm2
++	xorps	%xmm4,%xmm2
++	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
++	xorps	%xmm1,%xmm2
+ 	ret
+ .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
++.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+ ___
+ }
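The ${PREFIX}_set_decrypt_key changes earlier in this file's diff keep the usual scheme: build the encryption schedule (now via the renamed __aesni_set_encrypt_key), then walk it from both ends, swapping round keys and running aesimc over all but the outermost pair so the equivalent-inverse-cipher keys come out in forward order. As a plain-Perl sketch, with inv_mix_columns() a hypothetical stand-in for what aesimc computes:

    sub invert_key_schedule {
        my @ek = @_;                            # encryption round keys, 0 .. $#ek
        my @dk = reverse @ek;                   # outermost keys are only swapped
        $dk[$_] = inv_mix_columns($dk[$_]) for 1 .. $#dk - 1;   # middle keys get InvMixColumns
        return @dk;
    }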
+ 
+ $code.=<<___;
++.align	64
++.Lbswap_mask:
++	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
++.Lincrement32:
++	.long	6,6,6,0
++.Lincrement64:
++	.long	1,0,0,0
++.Lxts_magic:
++	.long	0x87,0,1,0
++
+ .asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+ .align	64
+ ___
+diff -up openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl.intelopts openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl
+--- openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl.intelopts	2013-02-19 21:15:39.390403182 +0100
++++ openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl	2013-02-19 21:15:39.425403896 +0100
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env perl
 +#!/usr/bin/perl
- #
+ 
  # ====================================================================
  # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -11,6 +11,145 @@
+@@ -11,10 +11,37 @@
  # OpenSSL context it's used with Intel engine, but can also be used as
- # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+ # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
  # details].
 +#
 +# Performance.
 +#
-+# Given aes(enc|dec) instructions' latency asymptotic performance for
-+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
-+# processed with 128-bit key. And given their throughput asymptotic
-+# performance for parallelizable modes is 1.25 cycles per byte. Being
-+# asymptotic limit it's not something you commonly achieve in reality,
-+# but how close does one get? Below are results collected for
-+# different modes and block sized. Pairs of numbers are for en-/
-+# decryption.
++# To start with see corresponding paragraph in aesni-x86_64.pl...
++# Instead of filling table similar to one found there I've chosen to
++# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
++# The simplified table below represents 32-bit performance relative
++# to 64-bit one in every given point. Ratios vary for different
++# encryption modes, therefore interval values.
 +#
 +#	16-byte     64-byte     256-byte    1-KB        8-KB
-+# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
-+# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
-+# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
-+# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
-+# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
-+# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
-+#
-+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
-+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
-+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
-+# The results were collected with specially crafted speed.c benchmark
-+# in order to compare them with results reported in "Intel Advanced
-+# Encryption Standard (AES) New Instruction Set" White Paper Revision
-+# 3.0 dated May 2010. All above results are consistently better. This
-+# module also provides better performance for block sizes smaller than
-+# 128 bytes in points *not* represented in the above table.
-+#
-+# Looking at the results for 8-KB buffer.
-+#
-+# CFB and OFB results are far from the limit, because implementation
-+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
-+# single-block aesni_encrypt, which is not the most optimal way to go.
-+# CBC encrypt result is unexpectedly high and there is no documented
-+# explanation for it. Seemingly there is a small penalty for feeding
-+# the result back to AES unit the way it's done in CBC mode. There is
-+# nothing one can do and the result appears optimal. CCM result is
-+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
-+# saving output. CCM CTR "stays invisible," because it's neatly
-+# interleaved wih CBC-MAC. This provides ~30% improvement over
-+# "straghtforward" CCM implementation with CTR and CBC-MAC performed
-+# disjointly. Parallelizable modes practically achieve the theoretical
-+# limit.
-+#
-+# Looking at how results vary with buffer size.
-+#
-+# Curves are practically saturated at 1-KB buffer size. In most cases
-+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
-+# CTR curve doesn't follow this pattern and is "slowest" changing one
-+# with "256-byte" result being 87% of "8-KB." This is because overhead
-+# in CTR mode is most computationally intensive. Small-block CCM
-+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
-+# iterations can't be interleaved.
-+#
-+# Results for 192- and 256-bit keys.
++#	53-67%      67-84%      91-94%      95-98%      97-99.5%
 +#
-+# EVP-free results were observed to scale perfectly with number of
-+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
-+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
-+# are a tad smaller, because the above mentioned penalty biases all
-+# results by same constant value. In similar way function call
-+# overhead affects small-block performance, as well as OFB and CFB
-+# results. Differences are not large, most common coefficients are
-+# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
-+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
++# Lower ratios for smaller block sizes are perfectly understandable,
++# because function call overhead is higher in 32-bit mode. Largest
++# 8-KB block performance is virtually same: 32-bit code is less than
++# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
 +
 +# January 2011
 +#
-+# While Westmere processor features 6 cycles latency for aes[enc|dec]
-+# instructions, which can be scheduled every second cycle, Sandy
-+# Bridge spends 8 cycles per instruction, but it can schedule them
-+# every cycle. This means that code targeting Westmere would perform
-+# suboptimally on Sandy Bridge. Therefore this update.
-+#
-+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
-+# optimized. Relative improvement might appear modest, 8% on Westmere,
-+# but in absolute terms it's 3.77 cycles per byte encrypted with
-+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
-+# should be compared to asymptotic limits of 3.75 for Westmere and
-+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
-+# to asymptotic limits is quite amazing. Indeed, the limit is
-+# calculated as latency times number of rounds, 10 for 128-bit key,
-+# and divided by 16, the number of bytes in block, or in other words
-+# it accounts *solely* for aesenc instructions. But there are extra
-+# instructions, and numbers so close to the asymptotic limits mean
-+# that it's as if it takes as little as *one* additional cycle to
-+# execute all of them. How is it possible? It is possible thanks to
-+# out-of-order execution logic, which manages to overlap post-
-+# processing of previous block, things like saving the output, with
-+# actual encryption of current block, as well as pre-processing of
-+# current block, things like fetching input and xor-ing it with
-+# 0-round element of the key schedule, with actual encryption of
-+# previous block. Keep this in mind...
-+#
-+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
-+# performance is achieved by interleaving instructions working on
-+# independent blocks. In which case asymptotic limit for such modes
-+# can be obtained by dividing above mentioned numbers by AES
-+# instructions' interleave factor. Westmere can execute at most 3 
-+# instructions at a time, meaning that optimal interleave factor is 3,
-+# and that's where the "magic" number of 1.25 come from. "Optimal
-+# interleave factor" means that increase of interleave factor does
-+# not improve performance. The formula has proven to reflect reality
-+# pretty well on Westmere... Sandy Bridge on the other hand can
-+# execute up to 8 AES instructions at a time, so how does varying
-+# interleave factor affect the performance? Here is table for ECB
-+# (numbers are cycles per byte processed with 128-bit key):
-+#
-+# instruction interleave factor		3x	6x	8x
-+# theoretical asymptotic limit		1.67	0.83	0.625
-+# measured performance for 8KB block	1.05	0.86	0.84
-+#
-+# "as if" interleave factor		4.7x	5.8x	6.0x
-+#
-+# Further data for other parallelizable modes:
-+#
-+# CBC decrypt				1.16	0.93	0.93
-+# CTR					1.14	0.91	n/a
-+#
-+# Well, given 3x column it's probably inappropriate to call the limit
-+# asymptotic, if it can be surpassed, isn't it? What happens there?
-+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
-+# magic is responsible for this. Processor overlaps not only the
-+# additional instructions with AES ones, but even AES instuctions
-+# processing adjacent triplets of independent blocks. In the 6x case
-+# additional instructions  still claim disproportionally small amount
-+# of additional cycles, but in 8x case number of instructions must be
-+# a tad too high for out-of-order logic to cope with, and AES unit
-+# remains underutilized... As you can see 8x interleave is hardly
-+# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
-+# utilizies 6x interleave because of limited register bank capacity.
-+#
-+# Higher interleave factors do have negative impact on Westmere
-+# performance. While for ECB mode it's negligible ~1.5%, other
-+# parallelizables perform ~5% worse, which is outweighed by ~25%
-+# improvement on Sandy Bridge. To balance regression on Westmere
-+# CTR mode was implemented with 6x aesenc interleave factor.
++# See aesni-x86_64.pl for details. Unlike x86_64 version this module
++# interleaves at most 6 aes[enc|dec] instructions, because there are
++# not enough registers for 8x interleave [which should be optimal for
++# Sandy Bridge]. Actually, performance results for 6x interleave
++# factor presented in aesni-x86_64.pl (except for CTR) are for this
++# module.
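The figures quoted in these comments (and in the fuller discussion in aesni-x86_64.pl) all follow from the same arithmetic: the serial, CBC-encrypt-style limit is instruction latency times rounds divided by 16 bytes, the parallelizable limit uses the issue interval instead of the latency, and the natural interleave factor is latency divided by issue interval. A quick sanity check of those numbers, using the per-CPU figures stated in the comments (10 rounds for a 128-bit key); this is only an illustration, not part of the patch:

    my @cpu = (
        [ "Westmere",     6, 2 ],       # aes[enc|dec] latency 6, schedulable every 2nd cycle
        [ "Sandy Bridge", 8, 1 ],       # latency 8, schedulable every cycle
    );
    my $rounds = 10;                    # 128-bit key
    for my $c (@cpu) {
        my ($name, $lat, $iss) = @$c;
        printf "%-12s serial %.2f, parallel %.3f cycles/byte, %dx interleave\n",
               $name, $lat*$rounds/16, $iss*$rounds/16, $lat/$iss;
    }
    # Westmere     serial 3.75, parallel 1.250 cycles/byte, 3x interleave
    # Sandy Bridge serial 5.00, parallel 0.625 cycles/byte, 8x interleave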
  
  $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
  			# generates drop-in replacement for
-@@ -29,7 +168,7 @@ die "can't locate x86_64-xlate.pl";
+ 			# crypto/aes/asm/aes-586.pl:-)
++$inline=1;		# inline _aesni_[en|de]crypt
  
- open STDOUT,"| $^X $xlate $flavour $output";
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC,"${dir}","${dir}../../perlasm");
+@@ -22,7 +49,8 @@ require "x86asm.pl";
  
--$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
-+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
- @_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
- 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
+ &asm_init($ARGV[0],$0);
  
-@@ -41,18 +180,20 @@ $inp="%rdi";
- $out="%rsi";
- $len="%rdx";
- $key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
--$ivp="%r8";	# cbc
-+$ivp="%r8";	# cbc, ctr, ...
+-$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
++if ($PREFIX eq "aesni")	{ $movekey=*movups; }
++else			{ $movekey=*movups; }
  
- $rnds_="%r10d";	# backup copy for $rounds
- $key_="%r11";	# backup copy for $key
+ $len="eax";
+ $rounds="ecx";
+@@ -32,114 +60,144 @@ $out="edi";
+ $rounds_="ebx";	# backup copy for $rounds
+ $key_="ebp";	# backup copy for $key
+ 
+-$inout0="xmm0";
+-$inout1="xmm1";
+-$inout2="xmm2";
+-$rndkey0="xmm3";
+-$rndkey1="xmm4";
+-$ivec="xmm5";
+-$in0="xmm6";
+-$in1="xmm7";	$inout3="xmm7";
+-
++$rndkey0="xmm0";
++$rndkey1="xmm1";
++$inout0="xmm2";
++$inout1="xmm3";
++$inout2="xmm4";
++$inout3="xmm5";	$in1="xmm5";
++$inout4="xmm6";	$in0="xmm6";
++$inout5="xmm7";	$ivec="xmm7";
++
++# AESNI extenstion
++sub aeskeygenassist
++{ my($dst,$src,$imm)=@_;
++    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
++}
++sub aescommon
++{ my($opcodelet,$dst,$src)=@_;
++    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
++}
++sub aesimc	{ aescommon(0xdb,@_); }
++sub aesenc	{ aescommon(0xdc,@_); }
++sub aesenclast	{ aescommon(0xdd,@_); }
++sub aesdec	{ aescommon(0xde,@_); }
++sub aesdeclast	{ aescommon(0xdf,@_); }
++
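The helpers just added hand-assemble the AES-NI instructions as raw 66 0F 38 xx /r byte sequences, because the assemblers this 32-bit module has to support may not know the mnemonics yet. As a worked example of the ModR/M byte computed by aescommon() above (illustrative only), &aesenc("xmm2","xmm1") comes out as:

    # opcode:       66 0f 38 dc = aesenc
    # ModR/M byte:  0xc0 | (dst << 3) | src = 0xc0 | (2 << 3) | 1 = 0xd1
    # i.e. the five bytes 66 0f 38 dc d1, which is "aesenc %xmm1,%xmm2" in AT&T syntax.
    printf "%02x %02x %02x %02x %02x\n", 0x66, 0x0f, 0x38, 0xdc, 0xc0 | (2 << 3) | 1;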
+ # Inline version of internal aesni_[en|de]crypt1
++{ my $sn;
+ sub aesni_inline_generate1
+-{ my $p=shift;
++{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
++  $sn++;
+ 
+     &$movekey		($rndkey0,&QWP(0,$key));
+     &$movekey		($rndkey1,&QWP(16,$key));
++    &xorps		($ivec,$rndkey0)	if (defined($ivec));
+     &lea		($key,&DWP(32,$key));
+-    &pxor		($inout0,$rndkey0);
+-    &set_label("${p}1_loop");
+-	eval"&aes${p}	($inout0,$rndkey1)";
++    &xorps		($inout,$ivec)		if (defined($ivec));
++    &xorps		($inout,$rndkey0)	if (!defined($ivec));
++    &set_label("${p}1_loop_$sn");
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&dec		($rounds);
+ 	&$movekey	($rndkey1,&QWP(0,$key));
+ 	&lea		($key,&DWP(16,$key));
+-    &jnz		(&label("${p}1_loop"));
+-    eval"&aes${p}last	($inout0,$rndkey1)";
+-}
++    &jnz		(&label("${p}1_loop_$sn"));
++    eval"&aes${p}last	($inout,$rndkey1)";
++}}
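The inline generator now takes an optional target register and IV (so CBC encrypt can fold the IV XOR into the key whitening) and numbers its loop labels with $sn, since the body may be expanded several times per file and each expansion needs its own ${p}1_loop_N label. The label-numbering idiom on its own, as a toy (the sub name here is invented, not part of the patch):

    {
        my $sn = 0;                     # shared across calls, one label per expansion
        sub next_loop_label { my $p = shift; ++$sn; return "${p}1_loop_$sn"; }
    }
    print next_loop_label("enc"), "\n" for 1 .. 3;   # enc1_loop_1, enc1_loop_2, enc1_loop_3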
  
- # %xmm register layout
--$inout0="%xmm0";	$inout1="%xmm1";
--$inout2="%xmm2";	$inout3="%xmm3";
--$rndkey0="%xmm4";	$rndkey1="%xmm5";
-+$rndkey0="%xmm0";	$rndkey1="%xmm1";
-+$inout0="%xmm2";	$inout1="%xmm3";
-+$inout2="%xmm4";	$inout3="%xmm5";
-+$inout4="%xmm6";	$inout5="%xmm7";
-+$inout6="%xmm8";	$inout7="%xmm9";
+ sub aesni_generate1	# fully unrolled loop
+-{ my $p=shift;
++{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  
--$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
--$in1="%xmm8";		$in2="%xmm9";
-+$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
-+$in0="%xmm8";		$iv="%xmm9";
- 
- # Inline version of internal aesni_[en|de]crypt1.
- #
-@@ -60,20 +201,29 @@ $in1="%xmm8";		$in2="%xmm9";
- # cycles which take care of loop variables...
- { my $sn;
- sub aesni_generate1 {
--my ($p,$key,$rounds)=@_;
-+my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
- ++$sn;
- $code.=<<___;
- 	$movkey	($key),$rndkey0
- 	$movkey	16($key),$rndkey1
-+___
-+$code.=<<___ if (defined($ivec));
-+	xorps	$rndkey0,$ivec
- 	lea	32($key),$key
--	pxor	$rndkey0,$inout0
-+	xorps	$ivec,$inout
-+___
-+$code.=<<___ if (!defined($ivec));
-+	lea	32($key),$key
-+	xorps	$rndkey0,$inout
-+___
-+$code.=<<___;
- .Loop_${p}1_$sn:
--	aes${p}	$rndkey1,$inout0
-+	aes${p}	$rndkey1,$inout
- 	dec	$rounds
- 	$movkey	($key),$rndkey1
- 	lea	16($key),$key
- 	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
--	aes${p}last	$rndkey1,$inout0
-+	aes${p}last	$rndkey1,$inout
- ___
- }}
- # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
-@@ -86,7 +236,7 @@ $code.=<<___;
- .align	16
- ${PREFIX}_encrypt:
- 	movups	($inp),$inout0		# load input
--	mov	240($key),$rounds	# pull $rounds
-+	mov	240($key),$rounds	# key->rounds
- ___
- 	&aesni_generate1("enc",$key,$rounds);
- $code.=<<___;
-@@ -99,7 +249,7 @@ $code.=<<___;
- .align	16
- ${PREFIX}_decrypt:
- 	movups	($inp),$inout0		# load input
--	mov	240($key),$rounds	# pull $rounds
-+	mov	240($key),$rounds	# key->rounds
- ___
- 	&aesni_generate1("dec",$key,$rounds);
- $code.=<<___;
-@@ -109,16 +259,16 @@ $code.=<<___;
- ___
+     &function_begin_B("_aesni_${p}rypt1");
+-	&$movekey	($rndkey0,&QWP(0,$key));
++	&movups		($rndkey0,&QWP(0,$key));
+ 	&$movekey	($rndkey1,&QWP(0x10,$key));
+-	&cmp		($rounds,11);
+-	&pxor		($inout0,$rndkey0);
++	&xorps		($inout,$rndkey0);
+ 	&$movekey	($rndkey0,&QWP(0x20,$key));
+ 	&lea		($key,&DWP(0x30,$key));
++	&cmp		($rounds,11);
+ 	&jb		(&label("${p}128"));
+ 	&lea		($key,&DWP(0x20,$key));
+ 	&je		(&label("${p}192"));
+ 	&lea		($key,&DWP(0x20,$key));
+-	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(-0x40,$key));
+-	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey0)";
+ 	&$movekey	($rndkey0,&QWP(-0x30,$key));
+     &set_label("${p}192");
+-	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(-0x20,$key));
+-	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey0)";
+ 	&$movekey	($rndkey0,&QWP(-0x10,$key));
+     &set_label("${p}128");
+-	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(0,$key));
+-	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey0)";
+ 	&$movekey	($rndkey0,&QWP(0x10,$key));
+-	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(0x20,$key));
+-	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey0)";
+ 	&$movekey	($rndkey0,&QWP(0x30,$key));
+-	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(0x40,$key));
+-	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey0)";
+ 	&$movekey	($rndkey0,&QWP(0x50,$key));
+-	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(0x60,$key));
+-	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey0)";
+ 	&$movekey	($rndkey0,&QWP(0x70,$key));
+-	eval"&aes${p}	($inout0,$rndkey1)";
+-    eval"&aes${p}last	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout,$rndkey1)";
++    eval"&aes${p}last	($inout,$rndkey0)";
+     &ret();
+     &function_end_B("_aesni_${p}rypt1");
  }
- 
--# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
--# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
--# latency is 6, it turned out that it can be scheduled only every
--# *second* cycle. Thus 3x interleave is the one providing optimal
-+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
-+# factor. Why 3x subroutine were originally used in loops? Even though
-+# aes[enc|dec] latency was originally 6, it could be scheduled only
-+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
- # utilization, i.e. when subroutine's throughput is virtually same as
- # of non-interleaved subroutine [for number of input blocks up to 3].
--# This is why it makes no sense to implement 2x subroutine. As soon
--# as/if Intel improves throughput by making it possible to schedule
--# the instructions in question *every* cycles I would have to
--# implement 6x interleave and use it in loop...
-+# This is why it makes no sense to implement 2x subroutine.
-+# aes[enc|dec] latency in next processor generation is 8, but the
-+# instructions can be scheduled every cycle. Optimal interleave for
-+# new processor is therefore 8x...
- sub aesni_generate3 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
-@@ -131,25 +281,25 @@ _aesni_${dir}rypt3:
- 	shr	\$1,$rounds
- 	$movkey	16($key),$rndkey1
- 	lea	32($key),$key
--	pxor	$rndkey0,$inout0
--	pxor	$rndkey0,$inout1
--	pxor	$rndkey0,$inout2
-+	xorps	$rndkey0,$inout0
-+	xorps	$rndkey0,$inout1
-+	xorps	$rndkey0,$inout2
-+	$movkey		($key),$rndkey0
- 
- .L${dir}_loop3:
- 	aes${dir}	$rndkey1,$inout0
--	$movkey		($key),$rndkey0
- 	aes${dir}	$rndkey1,$inout1
- 	dec		$rounds
- 	aes${dir}	$rndkey1,$inout2
--	aes${dir}	$rndkey0,$inout0
- 	$movkey		16($key),$rndkey1
-+	aes${dir}	$rndkey0,$inout0
- 	aes${dir}	$rndkey0,$inout1
- 	lea		32($key),$key
- 	aes${dir}	$rndkey0,$inout2
-+	$movkey		($key),$rndkey0
- 	jnz		.L${dir}_loop3
- 
- 	aes${dir}	$rndkey1,$inout0
--	$movkey		($key),$rndkey0
- 	aes${dir}	$rndkey1,$inout1
- 	aes${dir}	$rndkey1,$inout2
- 	aes${dir}last	$rndkey0,$inout0
-@@ -175,28 +325,28 @@ _aesni_${dir}rypt4:
- 	shr	\$1,$rounds
- 	$movkey	16($key),$rndkey1
- 	lea	32($key),$key
--	pxor	$rndkey0,$inout0
--	pxor	$rndkey0,$inout1
--	pxor	$rndkey0,$inout2
--	pxor	$rndkey0,$inout3
-+	xorps	$rndkey0,$inout0
-+	xorps	$rndkey0,$inout1
-+	xorps	$rndkey0,$inout2
-+	xorps	$rndkey0,$inout3
-+	$movkey	($key),$rndkey0
+-
++
+ # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+-# &aesni_generate1("dec");
++&aesni_generate1("enc") if (!$inline);
+ &function_begin_B("${PREFIX}_encrypt");
+ 	&mov	("eax",&wparam(0));
+ 	&mov	($key,&wparam(2));
+ 	&movups	($inout0,&QWP(0,"eax"));
+ 	&mov	($rounds,&DWP(240,$key));
+ 	&mov	("eax",&wparam(1));
+-	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
++	if ($inline)
++	{   &aesni_inline_generate1("enc");	}
++	else
++	{   &call	("_aesni_encrypt1");	}
+ 	&movups	(&QWP(0,"eax"),$inout0);
+ 	&ret	();
+ &function_end_B("${PREFIX}_encrypt");
  
- .L${dir}_loop4:
- 	aes${dir}	$rndkey1,$inout0
--	$movkey		($key),$rndkey0
- 	aes${dir}	$rndkey1,$inout1
- 	dec		$rounds
- 	aes${dir}	$rndkey1,$inout2
- 	aes${dir}	$rndkey1,$inout3
--	aes${dir}	$rndkey0,$inout0
- 	$movkey		16($key),$rndkey1
-+	aes${dir}	$rndkey0,$inout0
- 	aes${dir}	$rndkey0,$inout1
- 	lea		32($key),$key
- 	aes${dir}	$rndkey0,$inout2
- 	aes${dir}	$rndkey0,$inout3
-+	$movkey		($key),$rndkey0
- 	jnz		.L${dir}_loop4
+ # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+-# &aesni_generate1("dec");
++&aesni_generate1("dec") if(!$inline);
+ &function_begin_B("${PREFIX}_decrypt");
+ 	&mov	("eax",&wparam(0));
+ 	&mov	($key,&wparam(2));
+ 	&movups	($inout0,&QWP(0,"eax"));
+ 	&mov	($rounds,&DWP(240,$key));
+ 	&mov	("eax",&wparam(1));
+-	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt1");
++	if ($inline)
++	{   &aesni_inline_generate1("dec");	}
++	else
++	{   &call	("_aesni_decrypt1");	}
+ 	&movups	(&QWP(0,"eax"),$inout0);
+ 	&ret	();
+ &function_end_B("${PREFIX}_decrypt");
+-
+-# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
+-# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
+-# latency is 6, it turned out that it can be scheduled only every
+-# *second* cycle. Thus 3x interleave is the one providing optimal
++
++# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
++# factor. Why were 3x subroutines originally used in loops? Even though
++# aes[enc|dec] latency was originally 6, it could be scheduled only
++# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+ # utilization, i.e. when subroutine's throughput is virtually same as
+ # of non-interleaved subroutine [for number of input blocks up to 3].
+-# This is why it makes no sense to implement 2x subroutine. As soon
+-# as/if Intel improves throughput by making it possible to schedule
+-# the instructions in question *every* cycles I would have to
+-# implement 6x interleave and use it in loop...
++# This is why it makes no sense to implement 2x subroutine.
++# aes[enc|dec] latency in next processor generation is 8, but the
++# instructions can be scheduled every cycle. Optimal interleave for
++# new processor is therefore 8x, but it's unfeasible to accommodate it
++# in XMM registers addressable in 32-bit mode and therefore 6x is
++# used instead...
++
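
As a rough sketch of where the 3x and 8x figures in the comment above come
from, using only the latency and issue-rate numbers quoted there (nothing
measured here), the interleave factor needed to keep the AES unit busy is
roughly ceil(latency / issue interval):

    /* Sketch only: reproduces the arithmetic from the comment above.
     * latency        = cycles until an aesenc/aesdec result is ready
     * issue_interval = cycles between issues of independent aesenc ops */
    #include <stdio.h>

    static int interleave(int latency, int issue_interval)
    {
        return (latency + issue_interval - 1) / issue_interval;   /* ceil */
    }

    int main(void)
    {
        printf("older cores, latency 6, every 2nd cycle: %dx\n", interleave(6, 2));
        printf("newer cores, latency 8, every cycle:     %dx\n", interleave(8, 1));
        return 0;
    }

With only eight XMM registers addressable in 32-bit mode this file settles
for a 6x interleave, while the x86_64 generator elsewhere in this patch adds
6x and 8x paths.
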
+ sub aesni_generate3
+ { my $p=shift;
  
- 	aes${dir}	$rndkey1,$inout0
--	$movkey		($key),$rndkey0
- 	aes${dir}	$rndkey1,$inout1
- 	aes${dir}	$rndkey1,$inout2
- 	aes${dir}	$rndkey1,$inout3
-@@ -208,12 +358,158 @@ _aesni_${dir}rypt4:
- .size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
- ___
+@@ -148,24 +206,24 @@ sub aesni_generate3
+ 	&shr		($rounds,1);
+ 	&$movekey	($rndkey1,&QWP(16,$key));
+ 	&lea		($key,&DWP(32,$key));
+-	&pxor		($inout0,$rndkey0);
++	&xorps		($inout0,$rndkey0);
+ 	&pxor		($inout1,$rndkey0);
+ 	&pxor		($inout2,$rndkey0);
+-	&jmp		(&label("${p}3_loop"));
+-    &set_label("${p}3_loop",16);
+-	eval"&aes${p}	($inout0,$rndkey1)";
+ 	&$movekey	($rndkey0,&QWP(0,$key));
++
++    &set_label("${p}3_loop");
++	eval"&aes${p}	($inout0,$rndkey1)";
+ 	eval"&aes${p}	($inout1,$rndkey1)";
+ 	&dec		($rounds);
+ 	eval"&aes${p}	($inout2,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(16,$key));
+ 	eval"&aes${p}	($inout0,$rndkey0)";
+-	&lea		($key,&DWP(32,$key));
+ 	eval"&aes${p}	($inout1,$rndkey0)";
++	&lea		($key,&DWP(32,$key));
+ 	eval"&aes${p}	($inout2,$rndkey0)";
++	&$movekey	($rndkey0,&QWP(0,$key));
+ 	&jnz		(&label("${p}3_loop"));
+     eval"&aes${p}	($inout0,$rndkey1)";
+-    &$movekey		($rndkey0,&QWP(0,$key));
+     eval"&aes${p}	($inout1,$rndkey1)";
+     eval"&aes${p}	($inout2,$rndkey1)";
+     eval"&aes${p}last	($inout0,$rndkey0)";
+@@ -187,27 +245,28 @@ sub aesni_generate4
+ 	&$movekey	($rndkey1,&QWP(16,$key));
+ 	&shr		($rounds,1);
+ 	&lea		($key,&DWP(32,$key));
+-	&pxor		($inout0,$rndkey0);
++	&xorps		($inout0,$rndkey0);
+ 	&pxor		($inout1,$rndkey0);
+ 	&pxor		($inout2,$rndkey0);
+ 	&pxor		($inout3,$rndkey0);
+-	&jmp		(&label("${p}3_loop"));
+-    &set_label("${p}3_loop",16);
+-	eval"&aes${p}	($inout0,$rndkey1)";
+ 	&$movekey	($rndkey0,&QWP(0,$key));
++
++    &set_label("${p}4_loop");
++	eval"&aes${p}	($inout0,$rndkey1)";
+ 	eval"&aes${p}	($inout1,$rndkey1)";
+ 	&dec		($rounds);
+ 	eval"&aes${p}	($inout2,$rndkey1)";
+ 	eval"&aes${p}	($inout3,$rndkey1)";
+ 	&$movekey	($rndkey1,&QWP(16,$key));
+ 	eval"&aes${p}	($inout0,$rndkey0)";
+-	&lea		($key,&DWP(32,$key));
+ 	eval"&aes${p}	($inout1,$rndkey0)";
++	&lea		($key,&DWP(32,$key));
+ 	eval"&aes${p}	($inout2,$rndkey0)";
+ 	eval"&aes${p}	($inout3,$rndkey0)";
+-	&jnz		(&label("${p}3_loop"));
++	&$movekey	($rndkey0,&QWP(0,$key));
++    &jnz		(&label("${p}4_loop"));
++
+     eval"&aes${p}	($inout0,$rndkey1)";
+-    &$movekey		($rndkey0,&QWP(0,$key));
+     eval"&aes${p}	($inout1,$rndkey1)";
+     eval"&aes${p}	($inout2,$rndkey1)";
+     eval"&aes${p}	($inout3,$rndkey1)";
+@@ -218,12 +277,76 @@ sub aesni_generate4
+     &ret();
+     &function_end_B("_aesni_${p}rypt4");
  }
-+sub aesni_generate6 {
-+my $dir=shift;
-+# As already mentioned it takes in $key and $rounds, which are *not*
-+# preserved. $inout[0-5] is cipher/clear text...
-+$code.=<<___;
-+.type	_aesni_${dir}rypt6,\@abi-omnipotent
-+.align	16
-+_aesni_${dir}rypt6:
-+	$movkey		($key),$rndkey0
-+	shr		\$1,$rounds
-+	$movkey		16($key),$rndkey1
-+	lea		32($key),$key
-+	xorps		$rndkey0,$inout0
-+	pxor		$rndkey0,$inout1
-+	aes${dir}	$rndkey1,$inout0
-+	pxor		$rndkey0,$inout2
-+	aes${dir}	$rndkey1,$inout1
-+	pxor		$rndkey0,$inout3
-+	aes${dir}	$rndkey1,$inout2
-+	pxor		$rndkey0,$inout4
-+	aes${dir}	$rndkey1,$inout3
-+	pxor		$rndkey0,$inout5
-+	dec		$rounds
-+	aes${dir}	$rndkey1,$inout4
-+	$movkey		($key),$rndkey0
-+	aes${dir}	$rndkey1,$inout5
-+	jmp		.L${dir}_loop6_enter
-+.align	16
-+.L${dir}_loop6:
-+	aes${dir}	$rndkey1,$inout0
-+	aes${dir}	$rndkey1,$inout1
-+	dec		$rounds
-+	aes${dir}	$rndkey1,$inout2
-+	aes${dir}	$rndkey1,$inout3
-+	aes${dir}	$rndkey1,$inout4
-+	aes${dir}	$rndkey1,$inout5
-+.L${dir}_loop6_enter:				# happens to be 16-byte aligned
-+	$movkey		16($key),$rndkey1
-+	aes${dir}	$rndkey0,$inout0
-+	aes${dir}	$rndkey0,$inout1
-+	lea		32($key),$key
-+	aes${dir}	$rndkey0,$inout2
-+	aes${dir}	$rndkey0,$inout3
-+	aes${dir}	$rndkey0,$inout4
-+	aes${dir}	$rndkey0,$inout5
-+	$movkey		($key),$rndkey0
-+	jnz		.L${dir}_loop6
 +
-+	aes${dir}	$rndkey1,$inout0
-+	aes${dir}	$rndkey1,$inout1
-+	aes${dir}	$rndkey1,$inout2
-+	aes${dir}	$rndkey1,$inout3
-+	aes${dir}	$rndkey1,$inout4
-+	aes${dir}	$rndkey1,$inout5
-+	aes${dir}last	$rndkey0,$inout0
-+	aes${dir}last	$rndkey0,$inout1
-+	aes${dir}last	$rndkey0,$inout2
-+	aes${dir}last	$rndkey0,$inout3
-+	aes${dir}last	$rndkey0,$inout4
-+	aes${dir}last	$rndkey0,$inout5
-+	ret
-+.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
-+___
-+}
-+sub aesni_generate8 {
-+my $dir=shift;
-+# As already mentioned it takes in $key and $rounds, which are *not*
-+# preserved. $inout[0-7] is cipher/clear text...
-+$code.=<<___;
-+.type	_aesni_${dir}rypt8,\@abi-omnipotent
-+.align	16
-+_aesni_${dir}rypt8:
-+	$movkey		($key),$rndkey0
-+	shr		\$1,$rounds
-+	$movkey		16($key),$rndkey1
-+	lea		32($key),$key
-+	xorps		$rndkey0,$inout0
-+	xorps		$rndkey0,$inout1
-+	aes${dir}	$rndkey1,$inout0
-+	pxor		$rndkey0,$inout2
-+	aes${dir}	$rndkey1,$inout1
-+	pxor		$rndkey0,$inout3
-+	aes${dir}	$rndkey1,$inout2
-+	pxor		$rndkey0,$inout4
-+	aes${dir}	$rndkey1,$inout3
-+	pxor		$rndkey0,$inout5
-+	dec		$rounds
-+	aes${dir}	$rndkey1,$inout4
-+	pxor		$rndkey0,$inout6
-+	aes${dir}	$rndkey1,$inout5
-+	pxor		$rndkey0,$inout7
-+	$movkey		($key),$rndkey0
-+	aes${dir}	$rndkey1,$inout6
-+	aes${dir}	$rndkey1,$inout7
-+	$movkey		16($key),$rndkey1
-+	jmp		.L${dir}_loop8_enter
-+.align	16
-+.L${dir}_loop8:
-+	aes${dir}	$rndkey1,$inout0
-+	aes${dir}	$rndkey1,$inout1
-+	dec		$rounds
-+	aes${dir}	$rndkey1,$inout2
-+	aes${dir}	$rndkey1,$inout3
-+	aes${dir}	$rndkey1,$inout4
-+	aes${dir}	$rndkey1,$inout5
-+	aes${dir}	$rndkey1,$inout6
-+	aes${dir}	$rndkey1,$inout7
-+	$movkey		16($key),$rndkey1
-+.L${dir}_loop8_enter:				# happens to be 16-byte aligned
-+	aes${dir}	$rndkey0,$inout0
-+	aes${dir}	$rndkey0,$inout1
-+	lea		32($key),$key
-+	aes${dir}	$rndkey0,$inout2
-+	aes${dir}	$rndkey0,$inout3
-+	aes${dir}	$rndkey0,$inout4
-+	aes${dir}	$rndkey0,$inout5
-+	aes${dir}	$rndkey0,$inout6
-+	aes${dir}	$rndkey0,$inout7
-+	$movkey		($key),$rndkey0
-+	jnz		.L${dir}_loop8
++sub aesni_generate6
++{ my $p=shift;
++
++    &function_begin_B("_aesni_${p}rypt6");
++    &static_label("_aesni_${p}rypt6_enter");
++	&$movekey	($rndkey0,&QWP(0,$key));
++	&shr		($rounds,1);
++	&$movekey	($rndkey1,&QWP(16,$key));
++	&lea		($key,&DWP(32,$key));
++	&xorps		($inout0,$rndkey0);
++	&pxor		($inout1,$rndkey0);	# pxor does better here
++	eval"&aes${p}	($inout0,$rndkey1)";
++	&pxor		($inout2,$rndkey0);
++	eval"&aes${p}	($inout1,$rndkey1)";
++	&pxor		($inout3,$rndkey0);
++	&dec		($rounds);
++	eval"&aes${p}	($inout2,$rndkey1)";
++	&pxor		($inout4,$rndkey0);
++	eval"&aes${p}	($inout3,$rndkey1)";
++	&pxor		($inout5,$rndkey0);
++	eval"&aes${p}	($inout4,$rndkey1)";
++	&$movekey	($rndkey0,&QWP(0,$key));
++	eval"&aes${p}	($inout5,$rndkey1)";
++	&jmp		(&label("_aesni_${p}rypt6_enter"));
++
++    &set_label("${p}6_loop",16);
++	eval"&aes${p}	($inout0,$rndkey1)";
++	eval"&aes${p}	($inout1,$rndkey1)";
++	&dec		($rounds);
++	eval"&aes${p}	($inout2,$rndkey1)";
++	eval"&aes${p}	($inout3,$rndkey1)";
++	eval"&aes${p}	($inout4,$rndkey1)";
++	eval"&aes${p}	($inout5,$rndkey1)";
++    &set_label("_aesni_${p}rypt6_enter",16);
++	&$movekey	($rndkey1,&QWP(16,$key));
++	eval"&aes${p}	($inout0,$rndkey0)";
++	eval"&aes${p}	($inout1,$rndkey0)";
++	&lea		($key,&DWP(32,$key));
++	eval"&aes${p}	($inout2,$rndkey0)";
++	eval"&aes${p}	($inout3,$rndkey0)";
++	eval"&aes${p}	($inout4,$rndkey0)";
++	eval"&aes${p}	($inout5,$rndkey0)";
++	&$movekey	($rndkey0,&QWP(0,$key));
++    &jnz		(&label("${p}6_loop"));
 +
-+	aes${dir}	$rndkey1,$inout0
-+	aes${dir}	$rndkey1,$inout1
-+	aes${dir}	$rndkey1,$inout2
-+	aes${dir}	$rndkey1,$inout3
-+	aes${dir}	$rndkey1,$inout4
-+	aes${dir}	$rndkey1,$inout5
-+	aes${dir}	$rndkey1,$inout6
-+	aes${dir}	$rndkey1,$inout7
-+	aes${dir}last	$rndkey0,$inout0
-+	aes${dir}last	$rndkey0,$inout1
-+	aes${dir}last	$rndkey0,$inout2
-+	aes${dir}last	$rndkey0,$inout3
-+	aes${dir}last	$rndkey0,$inout4
-+	aes${dir}last	$rndkey0,$inout5
-+	aes${dir}last	$rndkey0,$inout6
-+	aes${dir}last	$rndkey0,$inout7
-+	ret
-+.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
-+___
++    eval"&aes${p}	($inout0,$rndkey1)";
++    eval"&aes${p}	($inout1,$rndkey1)";
++    eval"&aes${p}	($inout2,$rndkey1)";
++    eval"&aes${p}	($inout3,$rndkey1)";
++    eval"&aes${p}	($inout4,$rndkey1)";
++    eval"&aes${p}	($inout5,$rndkey1)";
++    eval"&aes${p}last	($inout0,$rndkey0)";
++    eval"&aes${p}last	($inout1,$rndkey0)";
++    eval"&aes${p}last	($inout2,$rndkey0)";
++    eval"&aes${p}last	($inout3,$rndkey0)";
++    eval"&aes${p}last	($inout4,$rndkey0)";
++    eval"&aes${p}last	($inout5,$rndkey0)";
++    &ret();
++    &function_end_B("_aesni_${p}rypt6");
 +}
  &aesni_generate3("enc") if ($PREFIX eq "aesni");
  &aesni_generate3("dec");
  &aesni_generate4("enc") if ($PREFIX eq "aesni");
  &aesni_generate4("dec");
+-
 +&aesni_generate6("enc") if ($PREFIX eq "aesni");
 +&aesni_generate6("dec");
-+&aesni_generate8("enc") if ($PREFIX eq "aesni");
-+&aesni_generate8("dec");
- 
++
  if ($PREFIX eq "aesni") {
-+########################################################################
++######################################################################
  # void aesni_ecb_encrypt (const void *in, void *out,
- #			  size_t length, const AES_KEY *key,
- #			  int enc);
-@@ -222,54 +518,98 @@ $code.=<<___;
- .type	aesni_ecb_encrypt,\@function,5
- .align	16
- aesni_ecb_encrypt:
--	cmp	\$16,$len		# check length
--	jb	.Lecb_ret
--
--	mov	240($key),$rounds	# pull $rounds
- 	and	\$-16,$len
-+	jz	.Lecb_ret
-+
-+	mov	240($key),$rounds	# key->rounds
-+	$movkey	($key),$rndkey0
- 	mov	$key,$key_		# backup $key
--	test	%r8d,%r8d		# 5th argument
- 	mov	$rounds,$rnds_		# backup $rounds
-+	test	%r8d,%r8d		# 5th argument
- 	jz	.Lecb_decrypt
- #--------------------------- ECB ENCRYPT ------------------------------#
--	sub	\$0x40,$len
--	jbe	.Lecb_enc_tail
--	jmp	.Lecb_enc_loop3
-+	cmp	\$0x80,$len
-+	jb	.Lecb_enc_tail
+ #                         size_t length, const AES_KEY *key,
+ #                         int enc);
+@@ -232,62 +355,93 @@ if ($PREFIX eq "aesni") {
+ 	&mov	($out,&wparam(1));
+ 	&mov	($len,&wparam(2));
+ 	&mov	($key,&wparam(3));
+-	&mov	($rounds,&wparam(4));
+-	&cmp	($len,16);
+-	&jb	(&label("ecb_ret"));
++	&mov	($rounds_,&wparam(4));
+ 	&and	($len,-16);
+-	&test	($rounds,$rounds)
++	&jz	(&label("ecb_ret"));
+ 	&mov	($rounds,&DWP(240,$key));
++	&test	($rounds_,$rounds_);
++	&jz	(&label("ecb_decrypt"));
 +
-+	movdqu	($inp),$inout0
-+	movdqu	0x10($inp),$inout1
-+	movdqu	0x20($inp),$inout2
-+	movdqu	0x30($inp),$inout3
-+	movdqu	0x40($inp),$inout4
-+	movdqu	0x50($inp),$inout5
-+	movdqu	0x60($inp),$inout6
-+	movdqu	0x70($inp),$inout7
-+	lea	0x80($inp),$inp
-+	sub	\$0x80,$len
-+	jmp	.Lecb_enc_loop8_enter
- .align 16
--.Lecb_enc_loop3:
--	movups	($inp),$inout0
--	movups	0x10($inp),$inout1
--	movups	0x20($inp),$inout2
--	call	_aesni_encrypt3
--	sub	\$0x30,$len
--	lea	0x30($inp),$inp
--	lea	0x30($out),$out
--	movups	$inout0,-0x30($out)
--	mov	$rnds_,$rounds		# restore $rounds
--	movups	$inout1,-0x20($out)
-+.Lecb_enc_loop8:
-+	movups	$inout0,($out)
- 	mov	$key_,$key		# restore $key
--	movups	$inout2,-0x10($out)
--	ja	.Lecb_enc_loop3
-+	movdqu	($inp),$inout0
-+	mov	$rnds_,$rounds		# restore $rounds
-+	movups	$inout1,0x10($out)
-+	movdqu	0x10($inp),$inout1
-+	movups	$inout2,0x20($out)
-+	movdqu	0x20($inp),$inout2
-+	movups	$inout3,0x30($out)
-+	movdqu	0x30($inp),$inout3
-+	movups	$inout4,0x40($out)
-+	movdqu	0x40($inp),$inout4
-+	movups	$inout5,0x50($out)
-+	movdqu	0x50($inp),$inout5
-+	movups	$inout6,0x60($out)
-+	movdqu	0x60($inp),$inout6
-+	movups	$inout7,0x70($out)
-+	lea	0x80($out),$out
-+	movdqu	0x70($inp),$inout7
-+	lea	0x80($inp),$inp
-+.Lecb_enc_loop8_enter:
+ 	&mov	($key_,$key);		# backup $key
+ 	&mov	($rounds_,$rounds);	# backup $rounds
+-	&jz	(&label("ecb_decrypt"));
++	&cmp	($len,0x60);
++	&jb	(&label("ecb_enc_tail"));
  
--.Lecb_enc_tail:
--	add	\$0x40,$len
-+	call	_aesni_encrypt8
+-	&sub	($len,0x40);
+-	&jbe	(&label("ecb_enc_tail"));
+-	&jmp	(&label("ecb_enc_loop3"));
++	&movdqu	($inout0,&QWP(0,$inp));
++	&movdqu	($inout1,&QWP(0x10,$inp));
++	&movdqu	($inout2,&QWP(0x20,$inp));
++	&movdqu	($inout3,&QWP(0x30,$inp));
++	&movdqu	($inout4,&QWP(0x40,$inp));
++	&movdqu	($inout5,&QWP(0x50,$inp));
++	&lea	($inp,&DWP(0x60,$inp));
++	&sub	($len,0x60);
++	&jmp	(&label("ecb_enc_loop6_enter"));
 +
-+	sub	\$0x80,$len
-+	jnc	.Lecb_enc_loop8
++&set_label("ecb_enc_loop6",16);
++	&movups	(&QWP(0,$out),$inout0);
++	&movdqu	($inout0,&QWP(0,$inp));
++	&movups	(&QWP(0x10,$out),$inout1);
++	&movdqu	($inout1,&QWP(0x10,$inp));
++	&movups	(&QWP(0x20,$out),$inout2);
++	&movdqu	($inout2,&QWP(0x20,$inp));
++	&movups	(&QWP(0x30,$out),$inout3);
++	&movdqu	($inout3,&QWP(0x30,$inp));
++	&movups	(&QWP(0x40,$out),$inout4);
++	&movdqu	($inout4,&QWP(0x40,$inp));
++	&movups	(&QWP(0x50,$out),$inout5);
++	&lea	($out,&DWP(0x60,$out));
++	&movdqu	($inout5,&QWP(0x50,$inp));
++	&lea	($inp,&DWP(0x60,$inp));
++&set_label("ecb_enc_loop6_enter");
 +
-+	movups	$inout0,($out)
-+	mov	$key_,$key		# restore $key
-+	movups	$inout1,0x10($out)
-+	mov	$rnds_,$rounds		# restore $rounds
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
-+	movups	$inout6,0x60($out)
-+	movups	$inout7,0x70($out)
-+	lea	0x80($out),$out
-+	add	\$0x80,$len
- 	jz	.Lecb_ret
++	&call	("_aesni_encrypt6");
+ 
+-&set_label("ecb_enc_loop3",16);
+-	&movups	($inout0,&QWP(0,$inp));
+-	&movups	($inout1,&QWP(0x10,$inp));
+-	&movups	($inout2,&QWP(0x20,$inp));
+-	&call	("_aesni_encrypt3");
+-	&sub	($len,0x30);
+-	&lea	($inp,&DWP(0x30,$inp));
+-	&lea	($out,&DWP(0x30,$out));
+-	&movups	(&QWP(-0x30,$out),$inout0);
+ 	&mov	($key,$key_);		# restore $key
+-	&movups	(&QWP(-0x20,$out),$inout1);
+ 	&mov	($rounds,$rounds_);	# restore $rounds
+-	&movups	(&QWP(-0x10,$out),$inout2);
+-	&ja	(&label("ecb_enc_loop3"));
++	&sub	($len,0x60);
++	&jnc	(&label("ecb_enc_loop6"));
+ 
+-&set_label("ecb_enc_tail");
+-	&add	($len,0x40);
++	&movups	(&QWP(0,$out),$inout0);
++	&movups	(&QWP(0x10,$out),$inout1);
++	&movups	(&QWP(0x20,$out),$inout2);
++	&movups	(&QWP(0x30,$out),$inout3);
++	&movups	(&QWP(0x40,$out),$inout4);
++	&movups	(&QWP(0x50,$out),$inout5);
++	&lea	($out,&DWP(0x60,$out));
++	&add	($len,0x60);
+ 	&jz	(&label("ecb_ret"));
+ 
+-	&cmp	($len,0x10);
++&set_label("ecb_enc_tail");
+ 	&movups	($inout0,&QWP(0,$inp));
+-	&je	(&label("ecb_enc_one"));
+ 	&cmp	($len,0x20);
++	&jb	(&label("ecb_enc_one"));
+ 	&movups	($inout1,&QWP(0x10,$inp));
+ 	&je	(&label("ecb_enc_two"));
+-	&cmp	($len,0x30);
+ 	&movups	($inout2,&QWP(0x20,$inp));
+-	&je	(&label("ecb_enc_three"));
++	&cmp	($len,0x40);
++	&jb	(&label("ecb_enc_three"));
+ 	&movups	($inout3,&QWP(0x30,$inp));
+-	&call	("_aesni_encrypt4");
++	&je	(&label("ecb_enc_four"));
++	&movups	($inout4,&QWP(0x40,$inp));
++	&xorps	($inout5,$inout5);
++	&call	("_aesni_encrypt6");
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&movups	(&QWP(0x10,$out),$inout1);
+ 	&movups	(&QWP(0x20,$out),$inout2);
+ 	&movups	(&QWP(0x30,$out),$inout3);
++	&movups	(&QWP(0x40,$out),$inout4);
+ 	jmp	(&label("ecb_ret"));
+ 
+ &set_label("ecb_enc_one",16);
+-	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
++	if ($inline)
++	{   &aesni_inline_generate1("enc");	}
++	else
++	{   &call	("_aesni_encrypt1");	}
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&jmp	(&label("ecb_ret"));
  
--	cmp	\$0x10,$len
-+.Lecb_enc_tail:
- 	movups	($inp),$inout0
--	je	.Lecb_enc_one
- 	cmp	\$0x20,$len
-+	jb	.Lecb_enc_one
- 	movups	0x10($inp),$inout1
- 	je	.Lecb_enc_two
--	cmp	\$0x30,$len
- 	movups	0x20($inp),$inout2
--	je	.Lecb_enc_three
-+	cmp	\$0x40,$len
-+	jb	.Lecb_enc_three
- 	movups	0x30($inp),$inout3
--	call	_aesni_encrypt4
-+	je	.Lecb_enc_four
-+	movups	0x40($inp),$inout4
-+	cmp	\$0x60,$len
-+	jb	.Lecb_enc_five
-+	movups	0x50($inp),$inout5
-+	je	.Lecb_enc_six
-+	movdqu	0x60($inp),$inout6
-+	call	_aesni_encrypt8
- 	movups	$inout0,($out)
- 	movups	$inout1,0x10($out)
- 	movups	$inout2,0x20($out)
- 	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
-+	movups	$inout6,0x60($out)
- 	jmp	.Lecb_ret
- .align	16
- .Lecb_enc_one:
-@@ -280,6 +620,7 @@ $code.=<<___;
- 	jmp	.Lecb_ret
- .align	16
- .Lecb_enc_two:
-+	xorps	$inout2,$inout2
- 	call	_aesni_encrypt3
- 	movups	$inout0,($out)
- 	movups	$inout1,0x10($out)
-@@ -291,47 +632,121 @@ $code.=<<___;
- 	movups	$inout1,0x10($out)
- 	movups	$inout2,0x20($out)
- 	jmp	.Lecb_ret
-+.align	16
-+.Lecb_enc_four:
-+	call	_aesni_encrypt4
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	jmp	.Lecb_ret
-+.align	16
-+.Lecb_enc_five:
-+	xorps	$inout5,$inout5
-+	call	_aesni_encrypt6
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	jmp	.Lecb_ret
-+.align	16
-+.Lecb_enc_six:
-+	call	_aesni_encrypt6
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
-+	jmp	.Lecb_ret
- #--------------------------- ECB DECRYPT ------------------------------#
- .align	16
- .Lecb_decrypt:
--	sub	\$0x40,$len
--	jbe	.Lecb_dec_tail
--	jmp	.Lecb_dec_loop3
-+	cmp	\$0x80,$len
-+	jb	.Lecb_dec_tail
+ &set_label("ecb_enc_two",16);
++	&xorps	($inout2,$inout2);
+ 	&call	("_aesni_encrypt3");
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&movups	(&QWP(0x10,$out),$inout1);
+@@ -300,53 +454,95 @@ if ($PREFIX eq "aesni") {
+ 	&movups	(&QWP(0x20,$out),$inout2);
+ 	&jmp	(&label("ecb_ret"));
+ 
++&set_label("ecb_enc_four",16);
++	&call	("_aesni_encrypt4");
++	&movups	(&QWP(0,$out),$inout0);
++	&movups	(&QWP(0x10,$out),$inout1);
++	&movups	(&QWP(0x20,$out),$inout2);
++	&movups	(&QWP(0x30,$out),$inout3);
++	&jmp	(&label("ecb_ret"));
++######################################################################
+ &set_label("ecb_decrypt",16);
+-	&sub	($len,0x40);
+-	&jbe	(&label("ecb_dec_tail"));
+-	&jmp	(&label("ecb_dec_loop3"));
++	&mov	($key_,$key);		# backup $key
++	&mov	($rounds_,$rounds);	# backup $rounds
++	&cmp	($len,0x60);
++	&jb	(&label("ecb_dec_tail"));
 +
-+	movdqu	($inp),$inout0
-+	movdqu	0x10($inp),$inout1
-+	movdqu	0x20($inp),$inout2
-+	movdqu	0x30($inp),$inout3
-+	movdqu	0x40($inp),$inout4
-+	movdqu	0x50($inp),$inout5
-+	movdqu	0x60($inp),$inout6
-+	movdqu	0x70($inp),$inout7
-+	lea	0x80($inp),$inp
-+	sub	\$0x80,$len
-+	jmp	.Lecb_dec_loop8_enter
- .align 16
--.Lecb_dec_loop3:
--	movups	($inp),$inout0
--	movups	0x10($inp),$inout1
--	movups	0x20($inp),$inout2
--	call	_aesni_decrypt3
--	sub	\$0x30,$len
--	lea	0x30($inp),$inp
--	lea	0x30($out),$out
--	movups	$inout0,-0x30($out)
--	mov	$rnds_,$rounds		# restore $rounds
--	movups	$inout1,-0x20($out)
-+.Lecb_dec_loop8:
-+	movups	$inout0,($out)
- 	mov	$key_,$key		# restore $key
--	movups	$inout2,-0x10($out)
--	ja	.Lecb_dec_loop3
-+	movdqu	($inp),$inout0
-+	mov	$rnds_,$rounds		# restore $rounds
-+	movups	$inout1,0x10($out)
-+	movdqu	0x10($inp),$inout1
-+	movups	$inout2,0x20($out)
-+	movdqu	0x20($inp),$inout2
-+	movups	$inout3,0x30($out)
-+	movdqu	0x30($inp),$inout3
-+	movups	$inout4,0x40($out)
-+	movdqu	0x40($inp),$inout4
-+	movups	$inout5,0x50($out)
-+	movdqu	0x50($inp),$inout5
-+	movups	$inout6,0x60($out)
-+	movdqu	0x60($inp),$inout6
-+	movups	$inout7,0x70($out)
-+	lea	0x80($out),$out
-+	movdqu	0x70($inp),$inout7
-+	lea	0x80($inp),$inp
-+.Lecb_dec_loop8_enter:
++	&movdqu	($inout0,&QWP(0,$inp));
++	&movdqu	($inout1,&QWP(0x10,$inp));
++	&movdqu	($inout2,&QWP(0x20,$inp));
++	&movdqu	($inout3,&QWP(0x30,$inp));
++	&movdqu	($inout4,&QWP(0x40,$inp));
++	&movdqu	($inout5,&QWP(0x50,$inp));
++	&lea	($inp,&DWP(0x60,$inp));
++	&sub	($len,0x60);
++	&jmp	(&label("ecb_dec_loop6_enter"));
 +
-+	call	_aesni_decrypt8
++&set_label("ecb_dec_loop6",16);
++	&movups	(&QWP(0,$out),$inout0);
++	&movdqu	($inout0,&QWP(0,$inp));
++	&movups	(&QWP(0x10,$out),$inout1);
++	&movdqu	($inout1,&QWP(0x10,$inp));
++	&movups	(&QWP(0x20,$out),$inout2);
++	&movdqu	($inout2,&QWP(0x20,$inp));
++	&movups	(&QWP(0x30,$out),$inout3);
++	&movdqu	($inout3,&QWP(0x30,$inp));
++	&movups	(&QWP(0x40,$out),$inout4);
++	&movdqu	($inout4,&QWP(0x40,$inp));
++	&movups	(&QWP(0x50,$out),$inout5);
++	&lea	($out,&DWP(0x60,$out));
++	&movdqu	($inout5,&QWP(0x50,$inp));
++	&lea	($inp,&DWP(0x60,$inp));
++&set_label("ecb_dec_loop6_enter");
 +
-+	$movkey	($key_),$rndkey0
-+	sub	\$0x80,$len
-+	jnc	.Lecb_dec_loop8
++	&call	("_aesni_decrypt6");
  
--.Lecb_dec_tail:
--	add	\$0x40,$len
-+	movups	$inout0,($out)
-+	mov	$key_,$key		# restore $key
-+	movups	$inout1,0x10($out)
-+	mov	$rnds_,$rounds		# restore $rounds
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
-+	movups	$inout6,0x60($out)
-+	movups	$inout7,0x70($out)
-+	lea	0x80($out),$out
-+	add	\$0x80,$len
- 	jz	.Lecb_ret
+-&set_label("ecb_dec_loop3",16);
+-	&movups	($inout0,&QWP(0,$inp));
+-	&movups	($inout1,&QWP(0x10,$inp));
+-	&movups	($inout2,&QWP(0x20,$inp));
+-	&call	("_aesni_decrypt3");
+-	&sub	($len,0x30);
+-	&lea	($inp,&DWP(0x30,$inp));
+-	&lea	($out,&DWP(0x30,$out));
+-	&movups	(&QWP(-0x30,$out),$inout0);
+ 	&mov	($key,$key_);		# restore $key
+-	&movups	(&QWP(-0x20,$out),$inout1);
+ 	&mov	($rounds,$rounds_);	# restore $rounds
+-	&movups	(&QWP(-0x10,$out),$inout2);
+-	&ja	(&label("ecb_dec_loop3"));
++	&sub	($len,0x60);
++	&jnc	(&label("ecb_dec_loop6"));
+ 
+-&set_label("ecb_dec_tail");
+-	&add	($len,0x40);
++	&movups	(&QWP(0,$out),$inout0);
++	&movups	(&QWP(0x10,$out),$inout1);
++	&movups	(&QWP(0x20,$out),$inout2);
++	&movups	(&QWP(0x30,$out),$inout3);
++	&movups	(&QWP(0x40,$out),$inout4);
++	&movups	(&QWP(0x50,$out),$inout5);
++	&lea	($out,&DWP(0x60,$out));
++	&add	($len,0x60);
+ 	&jz	(&label("ecb_ret"));
+ 
+-	&cmp	($len,0x10);
++&set_label("ecb_dec_tail");
+ 	&movups	($inout0,&QWP(0,$inp));
+-	&je	(&label("ecb_dec_one"));
+ 	&cmp	($len,0x20);
++	&jb	(&label("ecb_dec_one"));
+ 	&movups	($inout1,&QWP(0x10,$inp));
+ 	&je	(&label("ecb_dec_two"));
+-	&cmp	($len,0x30);
+ 	&movups	($inout2,&QWP(0x20,$inp));
+-	&je	(&label("ecb_dec_three"));
++	&cmp	($len,0x40);
++	&jb	(&label("ecb_dec_three"));
+ 	&movups	($inout3,&QWP(0x30,$inp));
+-	&call	("_aesni_decrypt4");
++	&je	(&label("ecb_dec_four"));
++	&movups	($inout4,&QWP(0x40,$inp));
++	&xorps	($inout5,$inout5);
++	&call	("_aesni_decrypt6");
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&movups	(&QWP(0x10,$out),$inout1);
+ 	&movups	(&QWP(0x20,$out),$inout2);
+ 	&movups	(&QWP(0x30,$out),$inout3);
++	&movups	(&QWP(0x40,$out),$inout4);
+ 	&jmp	(&label("ecb_ret"));
+ 
+ &set_label("ecb_dec_one",16);
+-	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt3");
++	if ($inline)
++	{   &aesni_inline_generate1("dec");	}
++	else
++	{   &call	("_aesni_decrypt1");	}
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&jmp	(&label("ecb_ret"));
  
--	cmp	\$0x10,$len
-+.Lecb_dec_tail:
- 	movups	($inp),$inout0
--	je	.Lecb_dec_one
- 	cmp	\$0x20,$len
-+	jb	.Lecb_dec_one
- 	movups	0x10($inp),$inout1
- 	je	.Lecb_dec_two
--	cmp	\$0x30,$len
- 	movups	0x20($inp),$inout2
--	je	.Lecb_dec_three
-+	cmp	\$0x40,$len
-+	jb	.Lecb_dec_three
- 	movups	0x30($inp),$inout3
--	call	_aesni_decrypt4
-+	je	.Lecb_dec_four
-+	movups	0x40($inp),$inout4
-+	cmp	\$0x60,$len
-+	jb	.Lecb_dec_five
-+	movups	0x50($inp),$inout5
-+	je	.Lecb_dec_six
-+	movups	0x60($inp),$inout6
-+	$movkey	($key),$rndkey0
-+	call	_aesni_decrypt8
- 	movups	$inout0,($out)
- 	movups	$inout1,0x10($out)
- 	movups	$inout2,0x20($out)
- 	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
-+	movups	$inout6,0x60($out)
- 	jmp	.Lecb_ret
- .align	16
- .Lecb_dec_one:
-@@ -342,6 +757,7 @@ $code.=<<___;
- 	jmp	.Lecb_ret
- .align	16
- .Lecb_dec_two:
-+	xorps	$inout2,$inout2
- 	call	_aesni_decrypt3
- 	movups	$inout0,($out)
- 	movups	$inout1,0x10($out)
-@@ -352,6 +768,34 @@ $code.=<<___;
- 	movups	$inout0,($out)
- 	movups	$inout1,0x10($out)
- 	movups	$inout2,0x20($out)
-+	jmp	.Lecb_ret
-+.align	16
-+.Lecb_dec_four:
-+	call	_aesni_decrypt4
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	jmp	.Lecb_ret
-+.align	16
-+.Lecb_dec_five:
-+	xorps	$inout5,$inout5
-+	call	_aesni_decrypt6
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	jmp	.Lecb_ret
-+.align	16
-+.Lecb_dec_six:
-+	call	_aesni_decrypt6
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
+ &set_label("ecb_dec_two",16);
++	&xorps	($inout2,$inout2);
+ 	&call	("_aesni_decrypt3");
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&movups	(&QWP(0x10,$out),$inout1);
+@@ -357,28 +553,42 @@ if ($PREFIX eq "aesni") {
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&movups	(&QWP(0x10,$out),$inout1);
+ 	&movups	(&QWP(0x20,$out),$inout2);
++	&jmp	(&label("ecb_ret"));
++
++&set_label("ecb_dec_four",16);
++	&call	("_aesni_decrypt4");
++	&movups	(&QWP(0,$out),$inout0);
++	&movups	(&QWP(0x10,$out),$inout1);
++	&movups	(&QWP(0x20,$out),$inout2);
++	&movups	(&QWP(0x30,$out),$inout3);
  
- .Lecb_ret:
- 	ret
-@@ -362,7 +806,8 @@ ___
+ &set_label("ecb_ret");
+ &function_end("aesni_ecb_encrypt");
+ }
+ 
++######################################################################
  # void $PREFIX_cbc_encrypt (const void *inp, void *out,
- #			    size_t length, const AES_KEY *key,
- #			    unsigned char *ivp,const int enc);
--$reserved = $win64?0x40:-0x18;	# used in decrypt
-+{
-+my $reserved = $win64?0x40:-0x18;	# used in decrypt
- $code.=<<___;
- .globl	${PREFIX}_cbc_encrypt
- .type	${PREFIX}_cbc_encrypt,\@function,6
-@@ -371,30 +816,30 @@ ${PREFIX}_cbc_encrypt:
- 	test	$len,$len		# check length
- 	jz	.Lcbc_ret
+ #                           size_t length, const AES_KEY *key,
+ #                           unsigned char *ivp,const int enc);
+ &function_begin("${PREFIX}_cbc_encrypt");
+ 	&mov	($inp,&wparam(0));
++	&mov	($rounds_,"esp");
+ 	&mov	($out,&wparam(1));
++	&sub	($rounds_,24);
+ 	&mov	($len,&wparam(2));
++	&and	($rounds_,-16);
+ 	&mov	($key,&wparam(3));
+-	&test	($len,$len);
+ 	&mov	($key_,&wparam(4));
+-	&jz	(&label("cbc_ret"));
++	&test	($len,$len);
++	&jz	(&label("cbc_abort"));
  
--	mov	240($key),$rnds_	# pull $rounds
-+	mov	240($key),$rnds_	# key->rounds
- 	mov	$key,$key_		# backup $key
- 	test	%r9d,%r9d		# 6th argument
- 	jz	.Lcbc_decrypt
- #--------------------------- CBC ENCRYPT ------------------------------#
- 	movups	($ivp),$inout0		# load iv as initial state
--	cmp	\$16,$len
- 	mov	$rnds_,$rounds
-+	cmp	\$16,$len
- 	jb	.Lcbc_enc_tail
- 	sub	\$16,$len
- 	jmp	.Lcbc_enc_loop
--.align 16
-+.align	16
- .Lcbc_enc_loop:
- 	movups	($inp),$inout1		# load input
- 	lea	16($inp),$inp
--	pxor	$inout1,$inout0
-+	#xorps	$inout1,$inout0
- ___
--	&aesni_generate1("enc",$key,$rounds);
-+	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
- $code.=<<___;
--	sub	\$16,$len
--	lea	16($out),$out
- 	mov	$rnds_,$rounds		# restore $rounds
- 	mov	$key_,$key		# restore $key
--	movups	$inout0,-16($out)	# store output
-+	movups	$inout0,0($out)		# store output
-+	lea	16($out),$out
-+	sub	\$16,$len
- 	jnc	.Lcbc_enc_loop
- 	add	\$16,$len
- 	jnz	.Lcbc_enc_tail
-@@ -429,92 +874,238 @@ $code.=<<___ if ($win64);
- ___
- $code.=<<___;
- 	movups	($ivp),$iv
--	sub	\$0x40,$len
- 	mov	$rnds_,$rounds
-+	cmp	\$0x70,$len
- 	jbe	.Lcbc_dec_tail
--	jmp	.Lcbc_dec_loop3
--.align 16
--.Lcbc_dec_loop3:
--	movups	($inp),$inout0
-+	shr	\$1,$rnds_
-+	sub	\$0x70,$len
-+	mov	$rnds_,$rounds
-+	movaps	$iv,$reserved(%rsp)
-+	jmp	.Lcbc_dec_loop8_enter
-+.align	16
-+.Lcbc_dec_loop8:
-+	movaps	$rndkey0,$reserved(%rsp)	# save IV
-+	movups	$inout7,($out)
-+	lea	0x10($out),$out
-+.Lcbc_dec_loop8_enter:
-+	$movkey		($key),$rndkey0
-+	movups	($inp),$inout0			# load input
- 	movups	0x10($inp),$inout1
--	movups	0x20($inp),$inout2
--	movaps	$inout0,$in0
--	movaps	$inout1,$in1
--	movaps	$inout2,$in2
--	call	_aesni_decrypt3
--	sub	\$0x30,$len
--	lea	0x30($inp),$inp
--	lea	0x30($out),$out
--	pxor	$iv,$inout0
--	pxor	$in0,$inout1
--	movaps	$in2,$iv
--	pxor	$in1,$inout2
--	movups	$inout0,-0x30($out)
--	mov	$rnds_,$rounds	# restore $rounds
--	movups	$inout1,-0x20($out)
--	mov	$key_,$key	# restore $key
--	movups	$inout2,-0x10($out)
--	ja	.Lcbc_dec_loop3
-+	$movkey		16($key),$rndkey1
+ 	&cmp	(&wparam(5),0);
+-	&movups	($ivec,&QWP(0,$key_));	# load IV
++	&xchg	($rounds_,"esp");		# alloca
++	&movups	($ivec,&QWP(0,$key_));		# load IV
+ 	&mov	($rounds,&DWP(240,$key));
+-	&mov	($key_,$key);		# backup $key
+-	&mov	($rounds_,$rounds);	# backup $rounds
++	&mov	($key_,$key);			# backup $key
++	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
++	&mov	($rounds_,$rounds);		# backup $rounds
+ 	&je	(&label("cbc_decrypt"));
  
--.Lcbc_dec_tail:
--	add	\$0x40,$len
--	movups	$iv,($ivp)
--	jz	.Lcbc_dec_ret
-+	lea		32($key),$key
-+	movdqu	0x20($inp),$inout2
-+	xorps		$rndkey0,$inout0
-+	movdqu	0x30($inp),$inout3
-+	xorps		$rndkey0,$inout1
-+	movdqu	0x40($inp),$inout4
-+	aesdec		$rndkey1,$inout0
-+	pxor		$rndkey0,$inout2
-+	movdqu	0x50($inp),$inout5
-+	aesdec		$rndkey1,$inout1
-+	pxor		$rndkey0,$inout3
-+	movdqu	0x60($inp),$inout6
-+	aesdec		$rndkey1,$inout2
-+	pxor		$rndkey0,$inout4
-+	movdqu	0x70($inp),$inout7
-+	aesdec		$rndkey1,$inout3
-+	pxor		$rndkey0,$inout5
-+	dec		$rounds
-+	aesdec		$rndkey1,$inout4
-+	pxor		$rndkey0,$inout6
-+	aesdec		$rndkey1,$inout5
-+	pxor		$rndkey0,$inout7
-+	$movkey		($key),$rndkey0
-+	aesdec		$rndkey1,$inout6
-+	aesdec		$rndkey1,$inout7
-+	$movkey		16($key),$rndkey1
-+
-+	call		.Ldec_loop8_enter
+ 	&movaps	($inout0,$ivec);
+@@ -388,15 +598,17 @@ if ($PREFIX eq "aesni") {
+ 	&jmp	(&label("cbc_enc_loop"));
  
-+	movups	($inp),$rndkey1		# re-load input
-+	movups	0x10($inp),$rndkey0
-+	xorps	$reserved(%rsp),$inout0	# ^= IV
-+	xorps	$rndkey1,$inout1
-+	movups	0x20($inp),$rndkey1
-+	xorps	$rndkey0,$inout2
-+	movups	0x30($inp),$rndkey0
-+	xorps	$rndkey1,$inout3
-+	movups	0x40($inp),$rndkey1
-+	xorps	$rndkey0,$inout4
-+	movups	0x50($inp),$rndkey0
-+	xorps	$rndkey1,$inout5
-+	movups	0x60($inp),$rndkey1
-+	xorps	$rndkey0,$inout6
-+	movups	0x70($inp),$rndkey0	# IV
-+	xorps	$rndkey1,$inout7
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	mov	$rnds_,$rounds		# restore $rounds
-+	movups	$inout4,0x40($out)
-+	mov	$key_,$key		# restore $key
-+	movups	$inout5,0x50($out)
-+	lea	0x80($inp),$inp
-+	movups	$inout6,0x60($out)
-+	lea	0x70($out),$out
-+	sub	\$0x80,$len
-+	ja	.Lcbc_dec_loop8
+ &set_label("cbc_enc_loop",16);
+-	&movups	($ivec,&QWP(0,$inp));
++	&movups	($ivec,&QWP(0,$inp));		# input actually
+ 	&lea	($inp,&DWP(16,$inp));
+-	&pxor	($inout0,$ivec);
+-	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt3");
+-	&sub	($len,16);
+-	&lea	($out,&DWP(16,$out));
++	if ($inline)
++	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
++	else
++	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
+ 	&mov	($rounds,$rounds_);	# restore $rounds
+ 	&mov	($key,$key_);		# restore $key
+-	&movups	(&QWP(-16,$out),$inout0);
++	&movups	(&QWP(0,$out),$inout0);	# store output
++	&lea	($out,&DWP(16,$out));
++	&sub	($len,16);
+ 	&jnc	(&label("cbc_enc_loop"));
+ 	&add	($len,16);
+ 	&jnz	(&label("cbc_enc_tail"));
+@@ -415,90 +627,151 @@ if ($PREFIX eq "aesni") {
+ 	&mov	($inp,$out);		# $inp and $out are the same
+ 	&mov	($key,$key_);		# restore $key
+ 	&jmp	(&label("cbc_enc_loop"));
+-
++######################################################################
+ &set_label("cbc_decrypt",16);
+-	&sub	($len,0x40);
++	&cmp	($len,0x50);
+ 	&jbe	(&label("cbc_dec_tail"));
+-	&jmp	(&label("cbc_dec_loop3"));
++	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
++	&sub	($len,0x50);
++	&jmp	(&label("cbc_dec_loop6_enter"));
 +
-+	movaps	$inout7,$inout0
-+	movaps	$rndkey0,$iv
-+	add	\$0x70,$len
-+	jle	.Lcbc_dec_tail_collected
-+	movups	$inout0,($out)
-+	lea	1($rnds_,$rnds_),$rounds
-+	lea	0x10($out),$out
-+.Lcbc_dec_tail:
- 	movups	($inp),$inout0
--	cmp	\$0x10,$len
- 	movaps	$inout0,$in0
-+	cmp	\$0x10,$len
- 	jbe	.Lcbc_dec_one
++&set_label("cbc_dec_loop6",16);
++	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
++	&movups	(&QWP(0,$out),$inout5);
++	&lea	($out,&DWP(0x10,$out));
++&set_label("cbc_dec_loop6_enter");
++	&movdqu	($inout0,&QWP(0,$inp));
++	&movdqu	($inout1,&QWP(0x10,$inp));
++	&movdqu	($inout2,&QWP(0x20,$inp));
++	&movdqu	($inout3,&QWP(0x30,$inp));
++	&movdqu	($inout4,&QWP(0x40,$inp));
++	&movdqu	($inout5,&QWP(0x50,$inp));
+ 
+-&set_label("cbc_dec_loop3",16);
+-	&movups	($inout0,&QWP(0,$inp));
+-	&movups	($inout1,&QWP(0x10,$inp));
+-	&movups	($inout2,&QWP(0x20,$inp));
+-	&movaps	($in0,$inout0);
+-	&movaps	($in1,$inout1);
+-	&call	("_aesni_decrypt3");
+-	&sub	($len,0x30);
+-	&lea	($inp,&DWP(0x30,$inp));
+-	&lea	($out,&DWP(0x30,$out));
+-	&pxor	($inout0,$ivec);
+-	&pxor	($inout1,$in0);
+-	&movups	($ivec,&QWP(-0x10,$inp));
+-	&pxor	($inout2,$in1);
+-	&movups	(&QWP(-0x30,$out),$inout0);
+-	&mov	($rounds,$rounds_)	# restore $rounds
+-	&movups	(&QWP(-0x20,$out),$inout1);
+-	&mov	($key,$key_);		# restore $key
+-	&movups	(&QWP(-0x10,$out),$inout2);
+-	&ja	(&label("cbc_dec_loop3"));
++	&call	("_aesni_decrypt6");
+ 
++	&movups	($rndkey1,&QWP(0,$inp));
++	&movups	($rndkey0,&QWP(0x10,$inp));
++	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
++	&xorps	($inout1,$rndkey1);
++	&movups	($rndkey1,&QWP(0x20,$inp));
++	&xorps	($inout2,$rndkey0);
++	&movups	($rndkey0,&QWP(0x30,$inp));
++	&xorps	($inout3,$rndkey1);
++	&movups	($rndkey1,&QWP(0x40,$inp));
++	&xorps	($inout4,$rndkey0);
++	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
++	&xorps	($inout5,$rndkey1);
++	&movups	(&QWP(0,$out),$inout0);
++	&movups	(&QWP(0x10,$out),$inout1);
++	&lea	($inp,&DWP(0x60,$inp));
++	&movups	(&QWP(0x20,$out),$inout2);
++	&mov	($rounds,$rounds_)		# restore $rounds
++	&movups	(&QWP(0x30,$out),$inout3);
++	&mov	($key,$key_);			# restore $key
++	&movups	(&QWP(0x40,$out),$inout4);
++	&lea	($out,&DWP(0x50,$out));
++	&sub	($len,0x60);
++	&ja	(&label("cbc_dec_loop6"));
 +
- 	movups	0x10($inp),$inout1
--	cmp	\$0x20,$len
- 	movaps	$inout1,$in1
-+	cmp	\$0x20,$len
- 	jbe	.Lcbc_dec_two
++	&movaps	($inout0,$inout5);
++	&movaps	($ivec,$rndkey0);
++	&add	($len,0x50);
++	&jle	(&label("cbc_dec_tail_collected"));
++	&movups	(&QWP(0,$out),$inout0);
++	&lea	($out,&DWP(0x10,$out));
+ &set_label("cbc_dec_tail");
+-	&add	($len,0x40);
+-	&jz	(&label("cbc_ret"));
+-
+ 	&movups	($inout0,&QWP(0,$inp));
+-	&cmp	($len,0x10);
+ 	&movaps	($in0,$inout0);
++	&cmp	($len,0x10);
+ 	&jbe	(&label("cbc_dec_one"));
 +
- 	movups	0x20($inp),$inout2
--	cmp	\$0x30,$len
- 	movaps	$inout2,$in2
-+	cmp	\$0x30,$len
- 	jbe	.Lcbc_dec_three
+ 	&movups	($inout1,&QWP(0x10,$inp));
+-	&cmp	($len,0x20);
+ 	&movaps	($in1,$inout1);
++	&cmp	($len,0x20);
+ 	&jbe	(&label("cbc_dec_two"));
 +
- 	movups	0x30($inp),$inout3
--	call	_aesni_decrypt4
--	pxor	$iv,$inout0
--	movups	0x30($inp),$iv
--	pxor	$in0,$inout1
-+	cmp	\$0x40,$len
-+	jbe	.Lcbc_dec_four
+ 	&movups	($inout2,&QWP(0x20,$inp));
+ 	&cmp	($len,0x30);
+ 	&jbe	(&label("cbc_dec_three"));
 +
-+	movups	0x40($inp),$inout4
-+	cmp	\$0x50,$len
-+	jbe	.Lcbc_dec_five
+ 	&movups	($inout3,&QWP(0x30,$inp));
+-	&call	("_aesni_decrypt4");
++	&cmp	($len,0x40);
++	&jbe	(&label("cbc_dec_four"));
 +
-+	movups	0x50($inp),$inout5
-+	cmp	\$0x60,$len
-+	jbe	.Lcbc_dec_six
++	&movups	($inout4,&QWP(0x40,$inp));
++	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
++	&movups	($inout0,&QWP(0,$inp));
++	&xorps	($inout5,$inout5);
++	&call	("_aesni_decrypt6");
++	&movups	($rndkey1,&QWP(0,$inp));
+ 	&movups	($rndkey0,&QWP(0x10,$inp));
++	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
++	&xorps	($inout1,$rndkey1);
+ 	&movups	($rndkey1,&QWP(0x20,$inp));
+-	&pxor	($inout0,$ivec);
+-	&pxor	($inout1,$in0);
+-	&movups	($ivec,&QWP(0x30,$inp));
++	&xorps	($inout2,$rndkey0);
++	&movups	($rndkey0,&QWP(0x30,$inp));
++	&xorps	($inout3,$rndkey1);
++	&movups	($ivec,&QWP(0x40,$inp));	# IV
++	&xorps	($inout4,$rndkey0);
+ 	&movups	(&QWP(0,$out),$inout0);
+-	&pxor	($inout2,$rndkey0);
+-	&pxor	($inout3,$rndkey1);
+ 	&movups	(&QWP(0x10,$out),$inout1);
+ 	&movups	(&QWP(0x20,$out),$inout2);
+-	&movaps	($inout0,$inout3);
+-	&lea	($out,&DWP(0x30,$out));
++	&movups	(&QWP(0x30,$out),$inout3);
++	&lea	($out,&DWP(0x40,$out));
++	&movaps	($inout0,$inout4);
++	&sub	($len,0x50);
+ 	&jmp	(&label("cbc_dec_tail_collected"));
+ 
+-&set_label("cbc_dec_one");
+-	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt3");
+-	&pxor	($inout0,$ivec);
++&set_label("cbc_dec_one",16);
++	if ($inline)
++	{   &aesni_inline_generate1("dec");	}
++	else
++	{   &call	("_aesni_decrypt1");	}
++	&xorps	($inout0,$ivec);
+ 	&movaps	($ivec,$in0);
++	&sub	($len,0x10);
+ 	&jmp	(&label("cbc_dec_tail_collected"));
+ 
+-&set_label("cbc_dec_two");
++&set_label("cbc_dec_two",16);
++	&xorps	($inout2,$inout2);
+ 	&call	("_aesni_decrypt3");
+-	&pxor	($inout0,$ivec);
+-	&pxor	($inout1,$in0);
++	&xorps	($inout0,$ivec);
++	&xorps	($inout1,$in0);
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&movaps	($inout0,$inout1);
+-	&movaps	($ivec,$in1);
+ 	&lea	($out,&DWP(0x10,$out));
++	&movaps	($ivec,$in1);
++	&sub	($len,0x20);
+ 	&jmp	(&label("cbc_dec_tail_collected"));
+ 
+-&set_label("cbc_dec_three");
++&set_label("cbc_dec_three",16);
+ 	&call	("_aesni_decrypt3");
+-	&pxor	($inout0,$ivec);
+-	&pxor	($inout1,$in0);
+-	&pxor	($inout2,$in1);
++	&xorps	($inout0,$ivec);
++	&xorps	($inout1,$in0);
++	&xorps	($inout2,$in1);
+ 	&movups	(&QWP(0,$out),$inout0);
+-	&movups	(&QWP(0x10,$out),$inout1);
+ 	&movaps	($inout0,$inout2);
+-	&movups	($ivec,&QWP(0x20,$inp));
++	&movups	(&QWP(0x10,$out),$inout1);
+ 	&lea	($out,&DWP(0x20,$out));
++	&movups	($ivec,&QWP(0x20,$inp));
++	&sub	($len,0x30);
++	&jmp	(&label("cbc_dec_tail_collected"));
 +
-+	movups	0x60($inp),$inout6
-+	movaps	$iv,$reserved(%rsp)	# save IV
-+	call	_aesni_decrypt8
-+	movups	($inp),$rndkey1
-+	movups	0x10($inp),$rndkey0
-+	xorps	$reserved(%rsp),$inout0	# ^= IV
-+	xorps	$rndkey1,$inout1
-+	movups	0x20($inp),$rndkey1
-+	xorps	$rndkey0,$inout2
-+	movups	0x30($inp),$rndkey0
-+	xorps	$rndkey1,$inout3
-+	movups	0x40($inp),$rndkey1
-+	xorps	$rndkey0,$inout4
-+	movups	0x50($inp),$rndkey0
-+	xorps	$rndkey1,$inout5
-+	movups	0x60($inp),$iv		# IV
-+	xorps	$rndkey0,$inout6
- 	movups	$inout0,($out)
--	pxor	$in1,$inout2
- 	movups	$inout1,0x10($out)
--	pxor	$in2,$inout3
- 	movups	$inout2,0x20($out)
--	movaps	$inout3,$inout0
--	lea	0x30($out),$out
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	movups	$inout5,0x50($out)
-+	lea	0x60($out),$out
-+	movaps	$inout6,$inout0
-+	sub	\$0x70,$len
- 	jmp	.Lcbc_dec_tail_collected
- .align	16
- .Lcbc_dec_one:
- ___
- 	&aesni_generate1("dec",$key,$rounds);
- $code.=<<___;
--	pxor	$iv,$inout0
-+	xorps	$iv,$inout0
- 	movaps	$in0,$iv
-+	sub	\$0x10,$len
- 	jmp	.Lcbc_dec_tail_collected
- .align	16
- .Lcbc_dec_two:
-+	xorps	$inout2,$inout2
- 	call	_aesni_decrypt3
--	pxor	$iv,$inout0
--	pxor	$in0,$inout1
-+	xorps	$iv,$inout0
-+	xorps	$in0,$inout1
- 	movups	$inout0,($out)
- 	movaps	$in1,$iv
- 	movaps	$inout1,$inout0
- 	lea	0x10($out),$out
-+	sub	\$0x20,$len
- 	jmp	.Lcbc_dec_tail_collected
- .align	16
- .Lcbc_dec_three:
- 	call	_aesni_decrypt3
--	pxor	$iv,$inout0
--	pxor	$in0,$inout1
-+	xorps	$iv,$inout0
-+	xorps	$in0,$inout1
- 	movups	$inout0,($out)
--	pxor	$in1,$inout2
-+	xorps	$in1,$inout2
- 	movups	$inout1,0x10($out)
- 	movaps	$in2,$iv
- 	movaps	$inout2,$inout0
- 	lea	0x20($out),$out
-+	sub	\$0x30,$len
-+	jmp	.Lcbc_dec_tail_collected
-+.align	16
-+.Lcbc_dec_four:
-+	call	_aesni_decrypt4
-+	xorps	$iv,$inout0
-+	movups	0x30($inp),$iv
-+	xorps	$in0,$inout1
-+	movups	$inout0,($out)
-+	xorps	$in1,$inout2
-+	movups	$inout1,0x10($out)
-+	xorps	$in2,$inout3
-+	movups	$inout2,0x20($out)
-+	movaps	$inout3,$inout0
-+	lea	0x30($out),$out
-+	sub	\$0x40,$len
-+	jmp	.Lcbc_dec_tail_collected
-+.align	16
-+.Lcbc_dec_five:
-+	xorps	$inout5,$inout5
-+	call	_aesni_decrypt6
-+	movups	0x10($inp),$rndkey1
-+	movups	0x20($inp),$rndkey0
-+	xorps	$iv,$inout0
-+	xorps	$in0,$inout1
-+	xorps	$rndkey1,$inout2
-+	movups	0x30($inp),$rndkey1
-+	xorps	$rndkey0,$inout3
-+	movups	0x40($inp),$iv
-+	xorps	$rndkey1,$inout4
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	lea	0x40($out),$out
-+	movaps	$inout4,$inout0
-+	sub	\$0x50,$len
-+	jmp	.Lcbc_dec_tail_collected
-+.align	16
-+.Lcbc_dec_six:
-+	call	_aesni_decrypt6
-+	movups	0x10($inp),$rndkey1
-+	movups	0x20($inp),$rndkey0
-+	xorps	$iv,$inout0
-+	xorps	$in0,$inout1
-+	xorps	$rndkey1,$inout2
-+	movups	0x30($inp),$rndkey1
-+	xorps	$rndkey0,$inout3
-+	movups	0x40($inp),$rndkey0
-+	xorps	$rndkey1,$inout4
-+	movups	0x50($inp),$iv
-+	xorps	$rndkey0,$inout5
-+	movups	$inout0,($out)
-+	movups	$inout1,0x10($out)
-+	movups	$inout2,0x20($out)
-+	movups	$inout3,0x30($out)
-+	movups	$inout4,0x40($out)
-+	lea	0x50($out),$out
-+	movaps	$inout5,$inout0
-+	sub	\$0x60,$len
- 	jmp	.Lcbc_dec_tail_collected
- .align	16
- .Lcbc_dec_tail_collected:
-@@ -523,10 +1114,12 @@ $code.=<<___;
- 	jnz	.Lcbc_dec_tail_partial
- 	movups	$inout0,($out)
- 	jmp	.Lcbc_dec_ret
-+.align	16
- .Lcbc_dec_tail_partial:
- 	movaps	$inout0,$reserved(%rsp)
-+	mov	\$16,%rcx
- 	mov	$out,%rdi
--	mov	$len,%rcx
-+	sub	$len,%rcx
- 	lea	$reserved(%rsp),%rsi
- 	.long	0x9066A4F3	# rep movsb
++&set_label("cbc_dec_four",16);
++	&call	("_aesni_decrypt4");
++	&movups	($rndkey1,&QWP(0x10,$inp));
++	&movups	($rndkey0,&QWP(0x20,$inp));
++	&xorps	($inout0,$ivec);
++	&movups	($ivec,&QWP(0x30,$inp));
++	&xorps	($inout1,$in0);
++	&movups	(&QWP(0,$out),$inout0);
++	&xorps	($inout2,$rndkey1);
++	&movups	(&QWP(0x10,$out),$inout1);
++	&xorps	($inout3,$rndkey0);
++	&movups	(&QWP(0x20,$out),$inout2);
++	&lea	($out,&DWP(0x30,$out));
++	&movaps	($inout0,$inout3);
++	&sub	($len,0x40);
  
-@@ -544,7 +1137,7 @@ $code.=<<___;
- 	ret
- .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
- ___
--
-+} 
- # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
- #				int bits, AES_KEY *key)
- { my ($inp,$bits,$key) = @_4args;
-@@ -556,7 +1149,7 @@ $code.=<<___;
- .align	16
- ${PREFIX}_set_decrypt_key:
- 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
--	call	_aesni_set_encrypt_key
-+	call	__aesni_set_encrypt_key
- 	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
- 	test	%eax,%eax
- 	jnz	.Ldec_key_ret
-@@ -576,9 +1169,9 @@ ${PREFIX}_set_decrypt_key:
- 	aesimc	%xmm1,%xmm1
- 	lea	16($key),$key
- 	lea	-16($inp),$inp
--	cmp	$key,$inp
- 	$movkey	%xmm0,16($inp)
- 	$movkey	%xmm1,-16($key)
-+	cmp	$key,$inp
- 	ja	.Ldec_key_inverse
+ &set_label("cbc_dec_tail_collected");
+ 	&and	($len,15);
+@@ -506,21 +779,21 @@ if ($PREFIX eq "aesni") {
+ 	&movups	(&QWP(0,$out),$inout0);
+ 	&jmp	(&label("cbc_ret"));
  
- 	$movkey	($key),%xmm0		# inverse middle
-@@ -605,16 +1198,16 @@ $code.=<<___;
- .type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
- .align	16
- ${PREFIX}_set_encrypt_key:
--_aesni_set_encrypt_key:
-+__aesni_set_encrypt_key:
- 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
--	test	$inp,$inp
- 	mov	\$-1,%rax
-+	test	$inp,$inp
- 	jz	.Lenc_key_ret
- 	test	$key,$key
- 	jz	.Lenc_key_ret
+-&set_label("cbc_dec_tail_partial");
+-	&mov	($key_,"esp");
+-	&sub	("esp",16);
+-	&and	("esp",-16);
++&set_label("cbc_dec_tail_partial",16);
+ 	&movaps	(&QWP(0,"esp"),$inout0);
++	&mov	("ecx",16);
+ 	&mov	($inp,"esp");
+-	&mov	("ecx",$len);
++	&sub	("ecx",$len);
+ 	&data_word(0xA4F3F689);		# rep movsb
+-	&mov	("esp",$key_);
+ 
+ &set_label("cbc_ret");
++	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
+ 	&mov	($key_,&wparam(4));
+ 	&movups	(&QWP(0,$key_),$ivec);	# output IV
++&set_label("cbc_abort");
+ &function_end("${PREFIX}_cbc_encrypt");
+-
++
++######################################################################
+ # Mechanical port from aesni-x86_64.pl.
+ #
+ # _aesni_set_encrypt_key is private interface,
+@@ -539,7 +812,7 @@ if ($PREFIX eq "aesni") {
+ 	&jz	(&label("bad_pointer"));
+ 
+ 	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
+-	&pxor	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
++	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
+ 	&lea	($key,&DWP(16,$key));
+ 	&cmp	($rounds,256);
+ 	&je	(&label("14rounds"));
+@@ -581,11 +854,11 @@ if ($PREFIX eq "aesni") {
+ 	&lea		($key,&DWP(16,$key));
+ &set_label("key_128_cold");
+ 	&shufps		("xmm4","xmm0",0b00010000);
+-	&pxor		("xmm0","xmm4");
+-	&shufps		("xmm4","xmm0",0b10001100,);
+-	&pxor		("xmm0","xmm4");
+-	&pshufd		("xmm1","xmm1",0b11111111);	# critical path
+-	&pxor		("xmm0","xmm1");
++	&xorps		("xmm0","xmm4");
++	&shufps		("xmm4","xmm0",0b10001100);
++	&xorps		("xmm0","xmm4");
++	&shufps		("xmm1","xmm1",0b11111111);	# critical path
++	&xorps		("xmm0","xmm1");
+ 	&ret();
  
- 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
--	pxor	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
-+	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
- 	lea	16($key),%rax
- 	cmp	\$256,$bits
- 	je	.L14rounds
-@@ -729,11 +1322,11 @@ _aesni_set_encrypt_key:
- 	lea	16(%rax),%rax
- .Lkey_expansion_128_cold:
- 	shufps	\$0b00010000,%xmm0,%xmm4
--	pxor	%xmm4, %xmm0
-+	xorps	%xmm4, %xmm0
- 	shufps	\$0b10001100,%xmm0,%xmm4
--	pxor	%xmm4, %xmm0
--	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
--	pxor	%xmm1,%xmm0
-+	xorps	%xmm4, %xmm0
-+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
-+	xorps	%xmm1,%xmm0
- 	ret
+ &set_label("12rounds",16);
+@@ -620,11 +893,11 @@ if ($PREFIX eq "aesni") {
+ 	&movaps		("xmm5","xmm2");
+ &set_label("key_192b_warm");
+ 	&shufps		("xmm4","xmm0",0b00010000);
+-	&movaps		("xmm3","xmm2");
+-	&pxor		("xmm0","xmm4");
++	&movdqa		("xmm3","xmm2");
++	&xorps		("xmm0","xmm4");
+ 	&shufps		("xmm4","xmm0",0b10001100);
+ 	&pslldq		("xmm3",4);
+-	&pxor		("xmm0","xmm4");
++	&xorps		("xmm0","xmm4");
+ 	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
+ 	&pxor		("xmm2","xmm3");
+ 	&pxor		("xmm0","xmm1");
+@@ -683,11 +956,11 @@ if ($PREFIX eq "aesni") {
+ 	&lea		($key,&DWP(16,$key));
+ &set_label("key_256a_cold");
+ 	&shufps		("xmm4","xmm0",0b00010000);
+-	&pxor		("xmm0","xmm4");
++	&xorps		("xmm0","xmm4");
+ 	&shufps		("xmm4","xmm0",0b10001100);
+-	&pxor		("xmm0","xmm4");
+-	&pshufd		("xmm1","xmm1",0b11111111);	# critical path
+-	&pxor		("xmm0","xmm1");
++	&xorps		("xmm0","xmm4");
++	&shufps		("xmm1","xmm1",0b11111111);	# critical path
++	&xorps		("xmm0","xmm1");
+ 	&ret();
  
- .align 16
-@@ -744,11 +1337,11 @@ _aesni_set_encrypt_key:
- 	movaps	%xmm2, %xmm5
- .Lkey_expansion_192b_warm:
- 	shufps	\$0b00010000,%xmm0,%xmm4
--	movaps	%xmm2,%xmm3
--	pxor	%xmm4,%xmm0
-+	movdqa	%xmm2,%xmm3
-+	xorps	%xmm4,%xmm0
- 	shufps	\$0b10001100,%xmm0,%xmm4
- 	pslldq	\$4,%xmm3
--	pxor	%xmm4,%xmm0
-+	xorps	%xmm4,%xmm0
- 	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
- 	pxor	%xmm3,%xmm2
- 	pxor	%xmm1,%xmm0
-@@ -772,11 +1365,11 @@ _aesni_set_encrypt_key:
- 	lea	16(%rax),%rax
- .Lkey_expansion_256a_cold:
- 	shufps	\$0b00010000,%xmm0,%xmm4
--	pxor	%xmm4,%xmm0
-+	xorps	%xmm4,%xmm0
- 	shufps	\$0b10001100,%xmm0,%xmm4
--	pxor	%xmm4,%xmm0
--	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
--	pxor	%xmm1,%xmm0
-+	xorps	%xmm4,%xmm0
-+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
-+	xorps	%xmm1,%xmm0
- 	ret
+ &set_label("key_256b",16);
+@@ -695,11 +968,11 @@ if ($PREFIX eq "aesni") {
+ 	&lea		($key,&DWP(16,$key));
  
- .align 16
-@@ -785,17 +1378,28 @@ _aesni_set_encrypt_key:
- 	lea	16(%rax),%rax
+ 	&shufps		("xmm4","xmm2",0b00010000);
+-	&pxor		("xmm2","xmm4");
++	&xorps		("xmm2","xmm4");
+ 	&shufps		("xmm4","xmm2",0b10001100);
+-	&pxor		("xmm2","xmm4");
+-	&pshufd		("xmm1","xmm1",0b10101010);	# critical path
+-	&pxor		("xmm2","xmm1");
++	&xorps		("xmm2","xmm4");
++	&shufps		("xmm1","xmm1",0b10101010);	# critical path
++	&xorps		("xmm2","xmm1");
+ 	&ret();
  
- 	shufps	\$0b00010000,%xmm2,%xmm4
--	pxor	%xmm4,%xmm2
-+	xorps	%xmm4,%xmm2
- 	shufps	\$0b10001100,%xmm2,%xmm4
--	pxor	%xmm4,%xmm2
--	pshufd	\$0b10101010,%xmm1,%xmm1	# critical path
--	pxor	%xmm1,%xmm2
-+	xorps	%xmm4,%xmm2
-+	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
-+	xorps	%xmm1,%xmm2
- 	ret
- .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
-+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
- ___
- }
- 
- $code.=<<___;
-+.align	64
-+.Lbswap_mask:
-+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-+.Lincrement32:
-+	.long	6,6,6,0
-+.Lincrement64:
-+	.long	1,0,0,0
-+.Lxts_magic:
-+	.long	0x87,0,1,0
-+
- .asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
- .align	64
- ___
-diff -up openssl-1.0.0d/crypto/cryptlib.c.intelopts openssl-1.0.0d/crypto/cryptlib.c
---- openssl-1.0.0d/crypto/cryptlib.c.intelopts	2010-11-19 01:11:27.000000000 +0100
-+++ openssl-1.0.0d/crypto/cryptlib.c	2011-08-24 12:50:55.000000000 +0200
+ &set_label("bad_pointer",4);
+@@ -747,9 +1020,9 @@ if ($PREFIX eq "aesni") {
+ 	&aesimc		("xmm1","xmm1");
+ 	&lea		($key,&DWP(16,$key));
+ 	&lea		("eax",&DWP(-16,"eax"));
+-	&cmp		("eax",$key);
+ 	&$movekey	(&QWP(16,"eax"),"xmm0");
+ 	&$movekey	(&QWP(-16,$key),"xmm1");
++	&cmp		("eax",$key);
+ 	&ja		(&label("dec_key_inverse"));
+ 
+ 	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
+diff -up openssl-1.0.0k/crypto/cryptlib.c.intelopts openssl-1.0.0k/crypto/cryptlib.c
+--- openssl-1.0.0k/crypto/cryptlib.c.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/cryptlib.c	2013-02-19 21:15:39.596407392 +0100
 @@ -662,22 +662,23 @@ const char *CRYPTO_get_lock_name(int typ
  	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
  
@@ -2343,158 +2343,57 @@ diff -up openssl-1.0.0d/crypto/cryptlib.c.intelopts openssl-1.0.0d/crypto/cryptl
      /*
       * |(1<<10) sets a reserved bit to signal that variable
       * was initialized already... This is to avoid interference
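
The comment above refers to the convention this patch uses for the CPU
capability word, exposed as OPENSSL_ia32cap_X in the hunks below: reserved
CPUID bit 10 is set once to mark the word as initialized, and individual
features are then tested by shifting, bit 57 being the AES-NI flag checked
by eng_aesni.c. A minimal sketch of that convention; the helper names are
illustrative only and not part of the patch:

    /* Illustrative helpers, assuming the OPENSSL_ia32cap_X variable that the
     * patch declares in eng_aesni.c and fips_standalone_sha1.c. */
    extern unsigned long long OPENSSL_ia32cap_X;

    static int ia32cap_initialized(void)
    {
        return (int)((OPENSSL_ia32cap_X >> 10) & 1);  /* reserved bit reused as "done" flag */
    }

    static int cpu_has_aesni(void)
    {
        return (int)((OPENSSL_ia32cap_X >> 57) & 1);  /* CPUID.1:ECX bit 25, stored at 32+25 */
    }
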
-diff -up openssl-1.0.0d/crypto/engine/eng_aesni.c.intelopts openssl-1.0.0d/crypto/engine/eng_aesni.c
---- openssl-1.0.0d/crypto/engine/eng_aesni.c.intelopts	2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/engine/eng_aesni.c	2011-08-24 12:50:55.000000000 +0200
+diff -up openssl-1.0.0k/crypto/engine/eng_aesni.c.intelopts openssl-1.0.0k/crypto/engine/eng_aesni.c
+--- openssl-1.0.0k/crypto/engine/eng_aesni.c.intelopts	2013-02-19 21:15:39.419403774 +0100
++++ openssl-1.0.0k/crypto/engine/eng_aesni.c	2013-02-19 21:15:39.608407632 +0100
 @@ -157,16 +157,20 @@ typedef unsigned __int64 IA32CAP;
  typedef unsigned long long IA32CAP;
  #endif
  
-+extern IA32CAP OPENSSL_ia32cap_X;
-+
- /* Prepare the ENGINE structure for registration */
- static int
- aesni_bind_helper(ENGINE *e)
- {
- 	int engage;
--	if (sizeof(OPENSSL_ia32cap_P) > 4) {
--		engage = (OPENSSL_ia32cap_P >> 57) & 1;
--	} else {
--		IA32CAP OPENSSL_ia32_cpuid(void);
--		engage = (OPENSSL_ia32_cpuid() >> 57) & 1;
-+	engage = (OPENSSL_ia32cap_X >> 57) & 1;
-+
-+	/* Disable the AES-NI support if the environment variable
-+	 * OPENSSL_DISABLE_AES_NI is set to any value
-+	 */
-+	if (getenv("OPENSSL_DISABLE_AES_NI") != NULL) {
-+		engage = 0;
- 	}
- 
- 	/* Register everything or return with an error */
-diff -up openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c.intelopts openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c
---- openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c.intelopts	2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c	2011-08-24 12:50:55.000000000 +0200
-@@ -62,6 +62,8 @@ void OPENSSL_cleanse(void *p,size_t len)
- 
- #ifdef OPENSSL_FIPS
- 
-+unsigned long long OPENSSL_ia32cap_X = 0;
-+
- static void hmac_init(SHA256_CTX *md_ctx,SHA256_CTX *o_ctx,
- 		      const char *key)
-     {
-diff -up openssl-1.0.0d/crypto/perlasm/x86asm.pl.intelopts openssl-1.0.0d/crypto/perlasm/x86asm.pl
---- openssl-1.0.0d/crypto/perlasm/x86asm.pl.intelopts	2008-12-17 20:56:47.000000000 +0100
-+++ openssl-1.0.0d/crypto/perlasm/x86asm.pl	2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
- 
- # require 'x86asm.pl';
- # &asm_init(<flavor>,"des-586.pl"[,$i386only]);
-@@ -80,6 +80,57 @@ sub ::movq
-     {	&::generic("movq",@_);			}
- }
- 
-+# SSE>2 instructions
-+my %regrm = (	"eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
-+		"esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7	);
-+sub ::pextrd
-+{ my($dst,$src,$imm)=@_;
-+    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
-+    {	&::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm);	}
-+    else
-+    {	&::generic("pextrd",@_);		}
-+}
-+
-+sub ::pinsrd
-+{ my($dst,$src,$imm)=@_;
-+    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
-+    {	&::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm);	}
-+    else
-+    {	&::generic("pinsrd",@_);		}
-+}
-+
-+sub ::pshufb
-+{ my($dst,$src)=@_;
-+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+    {	&data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);	}
-+    else
-+    {	&::generic("pshufb",@_);		}
-+}
-+
-+sub ::palignr
-+{ my($dst,$src,$imm)=@_;
-+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+    {	&::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm);	}
-+    else
-+    {	&::generic("palignr",@_);		}
-+}
-+
-+sub ::pclmulqdq
-+{ my($dst,$src,$imm)=@_;
-+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+    {	&::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm);	}
-+    else
-+    {	&::generic("pclmulqdq",@_);		}
-+}
-+
-+sub ::rdrand
-+{ my ($dst)=@_;
-+    if ($dst =~ /(e[a-dsd][ixp])/)
-+    {	&::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});	}
-+    else
-+    {	&::generic("rdrand",@_);	}
-+}
-+
- # label management
- $lbdecor="L";		# local label decoration, set by package
- $label="000";
-diff -up openssl-1.0.0d/crypto/perlasm/x86gas.pl.intelopts openssl-1.0.0d/crypto/perlasm/x86gas.pl
---- openssl-1.0.0d/crypto/perlasm/x86gas.pl.intelopts	2008-12-17 20:56:47.000000000 +0100
-+++ openssl-1.0.0d/crypto/perlasm/x86gas.pl	2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
- 
- package x86gas;
- 
-@@ -91,6 +91,7 @@ sub ::DWP
- }
- sub ::QWP	{ &::DWP(@_);	}
- sub ::BP	{ &::DWP(@_);	}
-+sub ::WP	{ &::DWP(@_);	}
- sub ::BC	{ @_;		}
- sub ::DWC	{ @_;		}
- 
-@@ -161,10 +162,16 @@ sub ::file_end
- 	    {	push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n");   }
++extern IA32CAP OPENSSL_ia32cap_X;
++
+ /* Prepare the ENGINE structure for registration */
+ static int
+ aesni_bind_helper(ENGINE *e)
+ {
+ 	int engage;
+-	if (sizeof(OPENSSL_ia32cap_P) > 4) {
+-		engage = (OPENSSL_ia32cap_P >> 57) & 1;
+-	} else {
+-		IA32CAP OPENSSL_ia32_cpuid(void);
+-		engage = (OPENSSL_ia32_cpuid() >> 57) & 1;
++	engage = (OPENSSL_ia32cap_X >> 57) & 1;
++
++	/* Disable the AES-NI support if the environment variable
++	 * OPENSSL_DISABLE_AES_NI is set to any value
++	 */
++	if (getenv("OPENSSL_DISABLE_AES_NI") != NULL) {
++		engage = 0;
  	}
-     }
-+    if (grep {/\b${nmdecor}OPENSSL_ia32cap_X\b/i} @out) {
-+	my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_X,8";
-+	if ($::elf)	{ push (@out,"$tmp,4\n"); }
-+	else		{ push (@out,"$tmp\n"); }
-+    }
-     push(@out,$initseg) if ($initseg);
- }
  
- sub ::data_byte	{   push(@out,".byte\t".join(',',@_)."\n");   }
-+sub ::data_short{   push(@out,".value\t".join(',',@_)."\n");  }
- sub ::data_word {   push(@out,".long\t".join(',',@_)."\n");   }
+ 	/* Register everything or return with an error */
+diff -up openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.intelopts openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c
+--- openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.intelopts	2013-02-19 21:15:39.373402833 +0100
++++ openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c	2013-02-19 21:15:39.608407632 +0100
+@@ -62,6 +62,8 @@ void OPENSSL_cleanse(void *p,size_t len)
  
- sub ::align
-diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl
---- openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts	2010-10-10 23:14:17.000000000 +0200
-+++ openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl	2011-08-24 12:50:56.000000000 +0200
+ #ifdef OPENSSL_FIPS
+ 
++unsigned long long OPENSSL_ia32cap_X = 0;
++
+ static void hmac_init(SHA256_CTX *md_ctx,SHA256_CTX *o_ctx,
+ 		      const char *key)
+     {
+diff -up openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl
+--- openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl	2013-02-19 21:15:39.619407858 +0100
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env perl
 +#!/usr/bin/perl
  
  # Ascetic x86_64 AT&T to MASM/NASM assembler translator by <appro>.
  #
-@@ -121,7 +121,11 @@ my %globals;
+@@ -117,7 +117,11 @@ my %globals;
  		$self->{sz} = "b";
  	    } elsif ($self->{op} =~ /call|jmp/) {
  		$self->{sz} = "";
@@ -2507,7 +2406,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
  		$self->{sz} = "";
  	    } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
  		$self->{op} = $1;
-@@ -246,35 +250,38 @@ my %globals;
+@@ -242,35 +246,38 @@ my %globals;
  	$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
  	$self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
  
@@ -2558,7 +2457,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
  	    } elsif ($self->{base} eq "rip") {
  		sprintf "%s[%s]",$szmap{$sz},$self->{label};
  	    } else {
-@@ -506,6 +513,11 @@ my %globals;
+@@ -502,6 +509,11 @@ my %globals;
  		    }
  		} elsif ($dir =~ /\.(text|data)/) {
  		    $current_segment=".$1";
@@ -2570,7 +2469,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
  		}
  		$line = "";
  		return $self;
-@@ -613,6 +625,19 @@ my %globals;
+@@ -610,6 +622,19 @@ my %globals;
  						.join(",", at str) if (@str);
  				    last;
  				  };
@@ -2590,7 +2489,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
  	    }
  	    $line = "";
  	}
-@@ -625,9 +650,133 @@ my %globals;
+@@ -622,9 +647,133 @@ my %globals;
      }
  }
  
@@ -2693,544 +2592,197 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
 +my $pclmulqdq = sub {
 +    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
 +      my @opcode=(0x66);
-+	rex(\@opcode,$3,$2);
-+	push @opcode,0x0f,0x3a,0x44;
-+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
-+	my $c=$1;
-+	push @opcode,$c=~/^0/?oct($c):$c;
-+	@opcode;
-+    } else {
-+	();
-+    }
-+};
-+
-+my $rdrand = sub {
-+    if (shift =~ /%[er](\w+)/) {
-+      my @opcode=();
-+      my $dst=$1;
-+	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
-+	rex(\@opcode,0,$1,8);
-+	push @opcode,0x0f,0xc7,0xf0|($dst&7);
-+	@opcode;
-+    } else {
-+	();
-+    }
-+};
-+
- if ($nasm) {
-     print <<___;
- default	rel
-+%define XMMWORD
- ___
- } elsif ($masm) {
-     print <<___;
-@@ -644,14 +793,22 @@ while($line=<>) {
- 
-     undef $label;
-     undef $opcode;
--    undef $sz;
-     undef @args;
- 
-     if ($label=label->re(\$line))	{ print $label->out(); }
- 
-     if (directive->re(\$line)) {
- 	printf "%s",directive->out();
--    } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
-+    } elsif ($opcode=opcode->re(\$line)) {
-+	my $asm = eval("\$".$opcode->mnemonic());
-+	undef @bytes;
-+	
-+	if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
-+	    print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
-+	    next;
-+	}
-+
-+	ARGUMENT: while (1) {
- 	my $arg;
- 
- 	if ($arg=register->re(\$line))	{ opcode->size($arg->size()); }
-@@ -667,19 +824,26 @@ while($line=<>) {
- 	$line =~ s/^,\s*//;
- 	} # ARGUMENT:
- 
--	$sz=opcode->size();
--
- 	if ($#args>=0) {
- 	    my $insn;
-+	    my $sz=opcode->size();
-+
- 	    if ($gas) {
- 		$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
-+		@args = map($_->out($sz),@args);
-+		printf "\t%s\t%s",$insn,join(",",@args);
- 	    } else {
- 		$insn = $opcode->out();
--		$insn .= $sz if (map($_->out() =~ /x?mm/,@args));
-+		foreach (@args) {
-+		    my $arg = $_->out();
-+		    # $insn.=$sz compensates for movq, pinsrw, ...
-+		    if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
-+		    if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
-+		}
- 		@args = reverse(@args);
- 		undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
-+		printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
- 	    }
--	    printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
- 	} else {
- 	    printf "\t%s",$opcode->out();
- 	}
-diff -up openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl.intelopts openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl
---- openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl.intelopts	2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl	2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
- #
- # ====================================================================
- # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -7,6 +7,8 @@
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
-+# July 2004
-+#
- # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
- # "hand-coded assembler"] doesn't stand for the whole improvement
- # coefficient. It turned out that eliminating RC4_CHAR from config
-@@ -19,6 +21,8 @@
- # to operate on partial registers, it turned out to be the best bet.
- # At least for AMD... How IA32E would perform remains to be seen...
- 
-+# November 2004
-+#
- # As was shown by Marc Bevand reordering of couple of load operations
- # results in even higher performance gain of 3.3x:-) At least on
- # Opteron... For reference, 1x in this case is RC4_CHAR C-code
-@@ -26,6 +30,8 @@
- # Latter means that if you want to *estimate* what to expect from
- # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
- 
-+# November 2004
-+#
- # Intel P4 EM64T core was found to run the AMD64 code really slow...
- # The only way to achieve comparable performance on P4 was to keep
- # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
-@@ -33,10 +39,14 @@
- # on either AMD and Intel platforms, I implement both cases. See
- # rc4_skey.c for further details...
- 
-+# April 2005
-+#
- # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
- # those with add/sub results in 50% performance improvement of folded
- # loop...
- 
-+# May 2005
-+#
- # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
- # performance by >30% [unlike P4 32-bit case that is]. But this is
- # provided that loads are reordered even more aggressively! Both code
-@@ -50,6 +60,8 @@
- # is not implemented, then this final RC4_CHAR code-path should be
- # preferred, as it provides better *all-round* performance].
- 
-+# March 2007
-+#
- # Intel Core2 was observed to perform poorly on both code paths:-( It
- # apparently suffers from some kind of partial register stall, which
- # occurs in 64-bit mode only [as virtually identical 32-bit loop was
-@@ -58,6 +70,34 @@
- # fit for Core2 and therefore the code was modified to skip cloop8 on
- # this CPU.
- 
-+# May 2010
-+#
-+# Intel Westmere was observed to perform suboptimally. Adding yet
-+# another movzb to cloop1 improved performance by almost 50%! Core2
-+# performance is improved too, but nominally...
-+
-+# May 2011
-+#
-+# The only code path that was not modified is P4-specific one. Non-P4
-+# Intel code path optimization is heavily based on submission by Maxim
-+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
-+# some of the ideas even in attempt to optmize the original RC4_INT
-+# code path... Current performance in cycles per processed byte (less
-+# is better) and improvement coefficients relative to previous
-+# version of this module are:
-+#
-+# Opteron	5.3/+0%
-+# P4		6.5
-+# Core2		6.2/+15%(*)
-+# Westmere	4.2/+60%
-+# Sandy Bridge	4.2/+120%
-+# Atom		9.3/+80%
-+#
-+# (*)	Note that Core2 result is ~15% lower than corresponding result
-+#	for 32-bit code, meaning that it's possible to improve it,
-+#	but more than likely at the cost of the others (see rc4-586.pl
-+#	to get the idea)...
-+
- $flavour = shift;
- $output  = shift;
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-@@ -76,13 +116,10 @@ $len="%rsi";	    # arg2
- $inp="%rdx";	    # arg3
- $out="%rcx";	    # arg4
- 
--@XX=("%r8","%r10");
--@TX=("%r9","%r11");
--$YY="%r12";
--$TY="%r13";
--
-+{
- $code=<<___;
- .text
-+.extern	OPENSSL_ia32cap_P
- 
- .globl	RC4
- .type	RC4,\@function,4
-@@ -95,48 +132,173 @@ RC4:	or	$len,$len
- 	push	%r12
- 	push	%r13
- .Lprologue:
-+	mov	$len,%r11
-+	mov	$inp,%r12
-+	mov	$out,%r13
-+___
-+my $len="%r11";		# reassign input arguments
-+my $inp="%r12";
-+my $out="%r13";
-+
-+my @XX=("%r10","%rsi");
-+my @TX=("%rax","%rbx");
-+my $YY="%rcx";
-+my $TY="%rdx";
- 
--	add	\$8,$dat
--	movl	-8($dat),$XX[0]#d
--	movl	-4($dat),$YY#d
-+$code.=<<___;
-+	xor	$XX[0],$XX[0]
-+	xor	$YY,$YY
++	rex(\@opcode,$3,$2);
++	push @opcode,0x0f,0x3a,0x44;
++	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
++	my $c=$1;
++	push @opcode,$c=~/^0/?oct($c):$c;
++	@opcode;
++    } else {
++	();
++    }
++};
 +
-+	lea	8($dat),$dat
-+	mov	-8($dat),$XX[0]#b
-+	mov	-4($dat),$YY#b
- 	cmpl	\$-1,256($dat)
- 	je	.LRC4_CHAR
-+	mov	OPENSSL_ia32cap_P(%rip),%r8d
-+	xor	$TX[1],$TX[1]
- 	inc	$XX[0]#b
-+	sub	$XX[0],$TX[1]
-+	sub	$inp,$out
- 	movl	($dat,$XX[0],4),$TX[0]#d
--	test	\$-8,$len
-+	test	\$-16,$len
- 	jz	.Lloop1
--	jmp	.Lloop8
-+	bt	\$30,%r8d	# Intel CPU?
-+	jc	.Lintel
-+	and	\$7,$TX[1]
-+	lea	1($XX[0]),$XX[1]
-+	jz	.Loop8
-+	sub	$TX[1],$len
-+.Loop8_warmup:
-+	add	$TX[0]#b,$YY#b
-+	movl	($dat,$YY,4),$TY#d
-+	movl	$TX[0]#d,($dat,$YY,4)
-+	movl	$TY#d,($dat,$XX[0],4)
-+	add	$TY#b,$TX[0]#b
-+	inc	$XX[0]#b
-+	movl	($dat,$TX[0],4),$TY#d
-+	movl	($dat,$XX[0],4),$TX[0]#d
-+	xorb	($inp),$TY#b
-+	movb	$TY#b,($out,$inp)
-+	lea	1($inp),$inp
-+	dec	$TX[1]
-+	jnz	.Loop8_warmup
++my $rdrand = sub {
++    if (shift =~ /%[er](\w+)/) {
++      my @opcode=();
++      my $dst=$1;
++	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
++	rex(\@opcode,0,$1,8);
++	push @opcode,0x0f,0xc7,0xf0|($dst&7);
++	@opcode;
++    } else {
++	();
++    }
++};
 +
-+	lea	1($XX[0]),$XX[1]
-+	jmp	.Loop8
- .align	16
--.Lloop8:
-+.Loop8:
- ___
- for ($i=0;$i<8;$i++) {
-+$code.=<<___ if ($i==7);
-+	add	\$8,$XX[1]#b
-+___
- $code.=<<___;
- 	add	$TX[0]#b,$YY#b
--	mov	$XX[0],$XX[1]
- 	movl	($dat,$YY,4),$TY#d
--	ror	\$8,%rax			# ror is redundant when $i=0
--	inc	$XX[1]#b
--	movl	($dat,$XX[1],4),$TX[1]#d
--	cmp	$XX[1],$YY
- 	movl	$TX[0]#d,($dat,$YY,4)
--	cmove	$TX[0],$TX[1]
--	movl	$TY#d,($dat,$XX[0],4)
-+	movl	`4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
-+	ror	\$8,%r8				# ror is redundant when $i=0
-+	movl	$TY#d,4*$i($dat,$XX[0],4)
- 	add	$TX[0]#b,$TY#b
--	movb	($dat,$TY,4),%al
-+	movb	($dat,$TY,4),%r8b
+ if ($nasm) {
+     print <<___;
+ default	rel
++%define XMMWORD
  ___
--push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
-+push(@TX,shift(@TX)); #push(@XX,shift(@XX));	# "rotate" registers
- }
- $code.=<<___;
--	ror	\$8,%rax
-+	add	\$8,$XX[0]#b
-+	ror	\$8,%r8
- 	sub	\$8,$len
+ } elsif ($masm) {
+     print <<___;
+@@ -641,14 +790,22 @@ while($line=<>) {
  
--	xor	($inp),%rax
--	add	\$8,$inp
--	mov	%rax,($out)
--	add	\$8,$out
-+	xor	($inp),%r8
-+	mov	%r8,($out,$inp)
-+	lea	8($inp),$inp
+     undef $label;
+     undef $opcode;
+-    undef $sz;
+     undef @args;
  
- 	test	\$-8,$len
--	jnz	.Lloop8
-+	jnz	.Loop8
-+	cmp	\$0,$len
-+	jne	.Lloop1
-+	jmp	.Lexit
+     if ($label=label->re(\$line))	{ print $label->out(); }
+ 
+     if (directive->re(\$line)) {
+ 	printf "%s",directive->out();
+-    } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
++    } elsif ($opcode=opcode->re(\$line)) {
++	my $asm = eval("\$".$opcode->mnemonic());
++	undef @bytes;
++	
++	if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
++	    print $gas?".byte\t":"DB\t",join(',', at bytes),"\n";
++	    next;
++	}
 +
-+.align	16
-+.Lintel:
-+	test	\$-32,$len
-+	jz	.Lloop1
-+	and	\$15,$TX[1]
-+	jz	.Loop16_is_hot
-+	sub	$TX[1],$len
-+.Loop16_warmup:
-+	add	$TX[0]#b,$YY#b
-+	movl	($dat,$YY,4),$TY#d
-+	movl	$TX[0]#d,($dat,$YY,4)
-+	movl	$TY#d,($dat,$XX[0],4)
-+	add	$TY#b,$TX[0]#b
-+	inc	$XX[0]#b
-+	movl	($dat,$TX[0],4),$TY#d
-+	movl	($dat,$XX[0],4),$TX[0]#d
-+	xorb	($inp),$TY#b
-+	movb	$TY#b,($out,$inp)
-+	lea	1($inp),$inp
-+	dec	$TX[1]
-+	jnz	.Loop16_warmup
++	ARGUMENT: while (1) {
+ 	my $arg;
+ 
+ 	if ($arg=register->re(\$line))	{ opcode->size($arg->size()); }
+@@ -664,19 +821,26 @@ while($line=<>) {
+ 	$line =~ s/^,\s*//;
+ 	} # ARGUMENT:
+ 
+-	$sz=opcode->size();
+-
+ 	if ($#args>=0) {
+ 	    my $insn;
++	    my $sz=opcode->size();
 +
-+	mov	$YY,$TX[1]
-+	xor	$YY,$YY
-+	mov	$TX[1]#b,$YY#b
+ 	    if ($gas) {
+ 		$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
++		@args = map($_->out($sz), at args);
++		printf "\t%s\t%s",$insn,join(",", at args);
+ 	    } else {
+ 		$insn = $opcode->out();
+-		$insn .= $sz if (map($_->out() =~ /x?mm/, at args));
++		foreach (@args) {
++		    my $arg = $_->out();
++		    # $insn.=$sz compensates for movq, pinsrw, ...
++		    if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
++		    if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
++		}
+ 		@args = reverse(@args);
+ 		undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
++		printf "\t%s\t%s",$insn,join(",",map($_->out($sz), at args));
+ 	    }
+-	    printf "\t%s\t%s",$insn,join(",",map($_->out($sz), at args));
+ 	} else {
+ 	    printf "\t%s",$opcode->out();
+ 	}
+diff -up openssl-1.0.0k/crypto/perlasm/x86asm.pl.intelopts openssl-1.0.0k/crypto/perlasm/x86asm.pl
+--- openssl-1.0.0k/crypto/perlasm/x86asm.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/perlasm/x86asm.pl	2013-02-19 21:15:39.611407695 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+ 
+ # require 'x86asm.pl';
+ # &asm_init(<flavor>,"des-586.pl"[,$i386only]);
+@@ -80,6 +80,57 @@ sub ::movq
+     {	&::generic("movq", at _);			}
+ }
+ 
++# SSE>2 instructions
++my %regrm = (	"eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
++		"esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7	);
++sub ::pextrd
++{ my($dst,$src,$imm)=@_;
++    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
++    {	&::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm);	}
++    else
++    {	&::generic("pextrd", at _);		}
++}
 +
-+.Loop16_is_hot:
-+	lea	($dat,$XX[0],4),$XX[1]
-+___
-+sub RC4_loop {
-+  my $i=shift;
-+  my $j=$i<0?0:$i;
-+  my $xmm="%xmm".($j&1);
++sub ::pinsrd
++{ my($dst,$src,$imm)=@_;
++    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
++    {	&::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm);	}
++    else
++    {	&::generic("pinsrd", at _);		}
++}
 +
-+    $code.="	add	\$16,$XX[0]#b\n"		if ($i==15);
-+    $code.="	movdqu	($inp),%xmm2\n"			if ($i==15);
-+    $code.="	add	$TX[0]#b,$YY#b\n"		if ($i<=0);
-+    $code.="	movl	($dat,$YY,4),$TY#d\n";
-+    $code.="	pxor	%xmm0,%xmm2\n"			if ($i==0);
-+    $code.="	psllq	\$8,%xmm1\n"			if ($i==0);
-+    $code.="	pxor	$xmm,$xmm\n"			if ($i<=1);
-+    $code.="	movl	$TX[0]#d,($dat,$YY,4)\n";
-+    $code.="	add	$TY#b,$TX[0]#b\n";
-+    $code.="	movl	`4*($j+1)`($XX[1]),$TX[1]#d\n"	if ($i<15);
-+    $code.="	movz	$TX[0]#b,$TX[0]#d\n";
-+    $code.="	movl	$TY#d,`4*$j`($XX[1])\n";
-+    $code.="	pxor	%xmm1,%xmm2\n"			if ($i==0);
-+    $code.="	lea	($dat,$XX[0],4),$XX[1]\n"	if ($i==15);
-+    $code.="	add	$TX[1]#b,$YY#b\n"		if ($i<15);
-+    $code.="	pinsrw	\$`$j>>1`,($dat,$TX[0],4),$xmm\n";
-+    $code.="	movdqu	%xmm2,($out,$inp)\n"		if ($i==0);
-+    $code.="	lea	16($inp),$inp\n"		if ($i==0);
-+    $code.="	movl	($XX[1]),$TX[1]#d\n"		if ($i==15);
++sub ::pshufb
++{ my($dst,$src)=@_;
++    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++    {	&data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);	}
++    else
++    {	&::generic("pshufb", at _);		}
 +}
-+	RC4_loop(-1);
-+$code.=<<___;
-+	jmp	.Loop16_enter
-+.align	16
-+.Loop16:
-+___
 +
-+for ($i=0;$i<16;$i++) {
-+    $code.=".Loop16_enter:\n"		if ($i==1);
-+	RC4_loop($i);
-+	push(@TX,shift(@TX)); 		# "rotate" registers
++sub ::palignr
++{ my($dst,$src,$imm)=@_;
++    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++    {	&::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm);	}
++    else
++    {	&::generic("palignr", at _);		}
 +}
-+$code.=<<___;
-+	mov	$YY,$TX[1]
-+	xor	$YY,$YY			# keyword to partial register
-+	sub	\$16,$len
-+	mov	$TX[1]#b,$YY#b
-+	test	\$-16,$len
-+	jnz	.Loop16
 +
-+	psllq	\$8,%xmm1
-+	pxor	%xmm0,%xmm2
-+	pxor	%xmm1,%xmm2
-+	movdqu	%xmm2,($out,$inp)
-+	lea	16($inp),$inp
++sub ::pclmulqdq
++{ my($dst,$src,$imm)=@_;
++    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++    {	&::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm);	}
++    else
++    {	&::generic("pclmulqdq", at _);		}
++}
 +
- 	cmp	\$0,$len
- 	jne	.Lloop1
- 	jmp	.Lexit
-@@ -152,9 +314,8 @@ $code.=<<___;
- 	movl	($dat,$TX[0],4),$TY#d
- 	movl	($dat,$XX[0],4),$TX[0]#d
- 	xorb	($inp),$TY#b
--	inc	$inp
--	movb	$TY#b,($out)
--	inc	$out
-+	movb	$TY#b,($out,$inp)
-+	lea	1($inp),$inp
- 	dec	$len
- 	jnz	.Lloop1
- 	jmp	.Lexit
-@@ -165,13 +326,11 @@ $code.=<<___;
- 	movzb	($dat,$XX[0]),$TX[0]#d
- 	test	\$-8,$len
- 	jz	.Lcloop1
--	cmpl	\$0,260($dat)
--	jnz	.Lcloop1
- 	jmp	.Lcloop8
- .align	16
- .Lcloop8:
--	mov	($inp),%eax
--	mov	4($inp),%ebx
-+	mov	($inp),%r8d
-+	mov	4($inp),%r9d
- ___
- # unroll 2x4-wise, because 64-bit rotates kill Intel P4...
- for ($i=0;$i<4;$i++) {
-@@ -188,8 +347,8 @@ $code.=<<___;
- 	mov	$TX[0],$TX[1]
- .Lcmov$i:
- 	add	$TX[0]#b,$TY#b
--	xor	($dat,$TY),%al
--	ror	\$8,%eax
-+	xor	($dat,$TY),%r8b
-+	ror	\$8,%r8d
- ___
- push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
- }
-@@ -207,16 +366,16 @@ $code.=<<___;
- 	mov	$TX[0],$TX[1]
- .Lcmov$i:
- 	add	$TX[0]#b,$TY#b
--	xor	($dat,$TY),%bl
--	ror	\$8,%ebx
-+	xor	($dat,$TY),%r9b
-+	ror	\$8,%r9d
- ___
- push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
- }
- $code.=<<___;
- 	lea	-8($len),$len
--	mov	%eax,($out)
-+	mov	%r8d,($out)
- 	lea	8($inp),$inp
--	mov	%ebx,4($out)
-+	mov	%r9d,4($out)
- 	lea	8($out),$out
- 
- 	test	\$-8,$len
-@@ -229,6 +388,7 @@ $code.=<<___;
- .align	16
- .Lcloop1:
- 	add	$TX[0]#b,$YY#b
-+	movzb	$YY#b,$YY#d
- 	movzb	($dat,$YY),$TY#d
- 	movb	$TX[0]#b,($dat,$YY)
- 	movb	$TY#b,($dat,$XX[0])
-@@ -260,12 +420,12 @@ $code.=<<___;
- 	ret
- .size	RC4,.-RC4
- ___
++sub ::rdrand
++{ my ($dst)=@_;
++    if ($dst =~ /(e[a-dsd][ixp])/)
++    {	&::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});	}
++    else
++    {	&::generic("rdrand", at _);	}
 +}
++
+ # label management
+ $lbdecor="L";		# local label decoration, set by package
+ $label="000";
+diff -up openssl-1.0.0k/crypto/perlasm/x86gas.pl.intelopts openssl-1.0.0k/crypto/perlasm/x86gas.pl
+--- openssl-1.0.0k/crypto/perlasm/x86gas.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/perlasm/x86gas.pl	2013-02-19 21:15:39.617407816 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
  
- $idx="%r8";
- $ido="%r9";
- 
- $code.=<<___;
--.extern	OPENSSL_ia32cap_P
- .globl	RC4_set_key
- .type	RC4_set_key,\@function,3
- .align	16
-@@ -280,12 +440,9 @@ RC4_set_key:
- 	xor	%r11,%r11
- 
- 	mov	OPENSSL_ia32cap_P(%rip),$idx#d
--	bt	\$20,$idx#d
--	jnc	.Lw1stloop
--	bt	\$30,$idx#d
--	setc	$ido#b
--	mov	$ido#d,260($dat)
--	jmp	.Lc1stloop
-+	bt	\$20,$idx#d	# RC4_CHAR?
-+	jc	.Lc1stloop
-+	jmp	.Lw1stloop
+ package x86gas;
  
- .align	16
- .Lw1stloop:
-@@ -348,18 +505,20 @@ RC4_options:
- 	lea	.Lopts(%rip),%rax
- 	mov	OPENSSL_ia32cap_P(%rip),%edx
- 	bt	\$20,%edx
--	jnc	.Ldone
--	add	\$12,%rax
-+	jc	.L8xchar
- 	bt	\$30,%edx
- 	jnc	.Ldone
--	add	\$13,%rax
-+	add	\$25,%rax
-+	ret
-+.L8xchar:
-+	add	\$12,%rax
- .Ldone:
- 	ret
- .align	64
- .Lopts:
- .asciz	"rc4(8x,int)"
- .asciz	"rc4(8x,char)"
--.asciz	"rc4(1x,char)"
-+.asciz	"rc4(16x,int)"
- .asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
- .align	64
- .size	RC4_options,.-RC4_options
-@@ -497,8 +656,17 @@ key_se_handler:
- ___
+@@ -91,6 +91,7 @@ sub ::DWP
  }
+ sub ::QWP	{ &::DWP(@_);	}
+ sub ::BP	{ &::DWP(@_);	}
++sub ::WP	{ &::DWP(@_);	}
+ sub ::BC	{ @_;		}
+ sub ::DWC	{ @_;		}
  
--$code =~ s/#([bwd])/$1/gm;
-+sub reg_part {
-+my ($reg,$conv)=@_;
-+    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
-+    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
-+    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
-+    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
-+    return $reg;
-+}
+@@ -161,10 +162,16 @@ sub ::file_end
+ 	    {	push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n");   }
+ 	}
+     }
++    if (grep {/\b${nmdecor}OPENSSL_ia32cap_X\b/i} @out) {
++	my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_X,8";
++	if ($::elf)	{ push (@out,"$tmp,4\n"); }
++	else		{ push (@out,"$tmp\n"); }
++    }
+     push(@out,$initseg) if ($initseg);
+ }
  
-+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
-+$code =~ s/\`([^\`]*)\`/eval $1/gem;
- $code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
+ sub ::data_byte	{   push(@out,".byte\t".join(',', at _)."\n");   }
++sub ::data_short{   push(@out,".value\t".join(',', at _)."\n");  }
+ sub ::data_word {   push(@out,".long\t".join(',', at _)."\n");   }
  
- print $code;
-diff -up openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl.intelopts openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl
---- openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl.intelopts	2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl	2011-08-24 12:50:56.000000000 +0200
+ sub ::align
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.intelopts openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.intelopts	2013-02-19 21:15:39.360402569 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl	2013-02-19 21:15:39.623407939 +0100
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env perl
 +#!/usr/bin/perl
@@ -3447,1906 +2999,2219 @@ diff -up openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl.intelopts openssl-1.0.0d/crypt
  &asciz	("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
  &align	(64);
  &function_end_B("RC4_options");
-diff -up openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl.intelopts openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl
---- openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl.intelopts	2010-01-17 17:58:56.000000000 +0100
-+++ openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl	2011-08-24 12:50:56.000000000 +0200
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.intelopts openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.intelopts	2013-02-19 21:15:39.360402569 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl	2013-02-19 21:15:39.621407898 +0100
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env perl
 +#!/usr/bin/perl
  #
  # ====================================================================
  # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -16,7 +16,7 @@
- # There was suggestion to mechanically translate 32-bit code, but I
- # dismissed it, reasoning that x86_64 offers enough register bank
- # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
--# implementation:-) However! While 64-bit code does performs better
-+# implementation:-) However! While 64-bit code does perform better
- # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
- # x86_64 does offer larger *addressable* bank, but out-of-order core
- # reaches for even more registers through dynamic aliasing, and EM64T
-@@ -29,6 +29,38 @@
- # Xeon P4	+65%		+0%		9.9
- # Core2		+60%		+10%		7.0
+@@ -7,6 +7,8 @@
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
++# July 2004
++#
+ # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+ # "hand-coded assembler"] doesn't stand for the whole improvement
+ # coefficient. It turned out that eliminating RC4_CHAR from config
+@@ -19,6 +21,8 @@
+ # to operate on partial registers, it turned out to be the best bet.
+ # At least for AMD... How IA32E would perform remains to be seen...
+ 
++# November 2004
++#
+ # As was shown by Marc Bevand reordering of couple of load operations
+ # results in even higher performance gain of 3.3x:-) At least on
+ # Opteron... For reference, 1x in this case is RC4_CHAR C-code
+@@ -26,6 +30,8 @@
+ # Latter means that if you want to *estimate* what to expect from
+ # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+ 
++# November 2004
++#
+ # Intel P4 EM64T core was found to run the AMD64 code really slow...
+ # The only way to achieve comparable performance on P4 was to keep
+ # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+@@ -33,10 +39,14 @@
+ # on either AMD and Intel platforms, I implement both cases. See
+ # rc4_skey.c for further details...
+ 
++# April 2005
++#
+ # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
+ # those with add/sub results in 50% performance improvement of folded
+ # loop...
+ 
++# May 2005
++#
+ # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
+ # performance by >30% [unlike P4 32-bit case that is]. But this is
+ # provided that loads are reordered even more aggressively! Both code
+@@ -50,6 +60,8 @@
+ # is not implemented, then this final RC4_CHAR code-path should be
+ # preferred, as it provides better *all-round* performance].
+ 
++# March 2007
++#
+ # Intel Core2 was observed to perform poorly on both code paths:-( It
+ # apparently suffers from some kind of partial register stall, which
+ # occurs in 64-bit mode only [as virtually identical 32-bit loop was
+@@ -58,6 +70,34 @@
+ # fit for Core2 and therefore the code was modified to skip cloop8 on
+ # this CPU.
  
-+# August 2009.
++# May 2010
 +#
-+# The code was revised to minimize code size and to maximize
-+# "distance" between instructions producing input to 'lea'
-+# instruction and the 'lea' instruction itself, which is essential
-+# for Intel Atom core.
++# Intel Westmere was observed to perform suboptimally. Adding yet
++# another movzb to cloop1 improved performance by almost 50%! Core2
++# performance is improved too, but nominally...
 +
-+# October 2010.
++# May 2011
 +#
-+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
-+# is to offload message schedule denoted by Wt in NIST specification,
-+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
-+# for background and implementation details. The only difference from
-+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
-+# to free temporary registers.
-+
-+# April 2011.
++# The only code path that was not modified is P4-specific one. Non-P4
++# Intel code path optimization is heavily based on submission by Maxim
++# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
++# some of the ideas even in attempt to optmize the original RC4_INT
++# code path... Current performance in cycles per processed byte (less
++# is better) and improvement coefficients relative to previous
++# version of this module are:
 +#
-+# Add AVX code path. See sha1-586.pl for further information.
-+
-+######################################################################
-+# Current performance is summarized in following table. Numbers are
-+# CPU clock cycles spent to process single byte (less is better).
++# Opteron	5.3/+0%
++# P4		6.5
++# Core2		6.2/+15%(*)
++# Westmere	4.2/+60%
++# Sandy Bridge	4.2/+120%
++# Atom		9.3/+80%
 +#
-+#		x86_64		SSSE3		AVX
-+# P4		9.8		-
-+# Opteron	6.6		-
-+# Core2		6.7		6.1/+10%	-
-+# Atom		11.0		9.7/+13%	-
-+# Westmere	7.1		5.6/+27%	-
-+# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
++# (*)	Note that Core2 result is ~15% lower than corresponding result
++#	for 32-bit code, meaning that it's possible to improve it,
++#	but more than likely at the cost of the others (see rc4-586.pl
++#	to get the idea)...
 +
  $flavour = shift;
  $output  = shift;
  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-@@ -40,6 +72,13 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
+@@ -76,13 +116,10 @@ $len="%rsi";	    # arg2
+ $inp="%rdx";	    # arg3
+ $out="%rcx";	    # arg4
  
-+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
-+	   $1>=2.19);
-+$avx=1 if (!$avx && $flavour =~ /nasm/ &&
-+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
-+	   $1>=2.03);
+- at XX=("%r8","%r10");
+- at TX=("%r9","%r11");
+-$YY="%r12";
+-$TY="%r13";
+-
++{
+ $code=<<___;
+ .text
++.extern	OPENSSL_ia32cap_P
+ 
+ .globl	RC4
+ .type	RC4,\@function,4
+@@ -95,48 +132,173 @@ RC4:	or	$len,$len
+ 	push	%r12
+ 	push	%r13
+ .Lprologue:
++	mov	$len,%r11
++	mov	$inp,%r12
++	mov	$out,%r13
++___
++my $len="%r11";		# reassign input arguments
++my $inp="%r12";
++my $out="%r13";
++
++my @XX=("%r10","%rsi");
++my @TX=("%rax","%rbx");
++my $YY="%rcx";
++my $TY="%rdx";
+ 
+-	add	\$8,$dat
+-	movl	-8($dat),$XX[0]#d
+-	movl	-4($dat),$YY#d
++$code.=<<___;
++	xor	$XX[0],$XX[0]
++	xor	$YY,$YY
++
++	lea	8($dat),$dat
++	mov	-8($dat),$XX[0]#b
++	mov	-4($dat),$YY#b
+ 	cmpl	\$-1,256($dat)
+ 	je	.LRC4_CHAR
++	mov	OPENSSL_ia32cap_P(%rip),%r8d
++	xor	$TX[1],$TX[1]
+ 	inc	$XX[0]#b
++	sub	$XX[0],$TX[1]
++	sub	$inp,$out
+ 	movl	($dat,$XX[0],4),$TX[0]#d
+-	test	\$-8,$len
++	test	\$-16,$len
+ 	jz	.Lloop1
+-	jmp	.Lloop8
++	bt	\$30,%r8d	# Intel CPU?
++	jc	.Lintel
++	and	\$7,$TX[1]
++	lea	1($XX[0]),$XX[1]
++	jz	.Loop8
++	sub	$TX[1],$len
++.Loop8_warmup:
++	add	$TX[0]#b,$YY#b
++	movl	($dat,$YY,4),$TY#d
++	movl	$TX[0]#d,($dat,$YY,4)
++	movl	$TY#d,($dat,$XX[0],4)
++	add	$TY#b,$TX[0]#b
++	inc	$XX[0]#b
++	movl	($dat,$TX[0],4),$TY#d
++	movl	($dat,$XX[0],4),$TX[0]#d
++	xorb	($inp),$TY#b
++	movb	$TY#b,($out,$inp)
++	lea	1($inp),$inp
++	dec	$TX[1]
++	jnz	.Loop8_warmup
++
++	lea	1($XX[0]),$XX[1]
++	jmp	.Loop8
+ .align	16
+-.Lloop8:
++.Loop8:
+ ___
+ for ($i=0;$i<8;$i++) {
++$code.=<<___ if ($i==7);
++	add	\$8,$XX[1]#b
++___
+ $code.=<<___;
+ 	add	$TX[0]#b,$YY#b
+-	mov	$XX[0],$XX[1]
+ 	movl	($dat,$YY,4),$TY#d
+-	ror	\$8,%rax			# ror is redundant when $i=0
+-	inc	$XX[1]#b
+-	movl	($dat,$XX[1],4),$TX[1]#d
+-	cmp	$XX[1],$YY
+ 	movl	$TX[0]#d,($dat,$YY,4)
+-	cmove	$TX[0],$TX[1]
+-	movl	$TY#d,($dat,$XX[0],4)
++	movl	`4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
++	ror	\$8,%r8				# ror is redundant when $i=0
++	movl	$TY#d,4*$i($dat,$XX[0],4)
+ 	add	$TX[0]#b,$TY#b
+-	movb	($dat,$TY,4),%al
++	movb	($dat,$TY,4),%r8b
+ ___
+-push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
++push(@TX,shift(@TX)); #push(@XX,shift(@XX));	# "rotate" registers
+ }
+ $code.=<<___;
+-	ror	\$8,%rax
++	add	\$8,$XX[0]#b
++	ror	\$8,%r8
+ 	sub	\$8,$len
+ 
+-	xor	($inp),%rax
+-	add	\$8,$inp
+-	mov	%rax,($out)
+-	add	\$8,$out
++	xor	($inp),%r8
++	mov	%r8,($out,$inp)
++	lea	8($inp),$inp
+ 
+ 	test	\$-8,$len
+-	jnz	.Lloop8
++	jnz	.Loop8
++	cmp	\$0,$len
++	jne	.Lloop1
++	jmp	.Lexit
++
++.align	16
++.Lintel:
++	test	\$-32,$len
++	jz	.Lloop1
++	and	\$15,$TX[1]
++	jz	.Loop16_is_hot
++	sub	$TX[1],$len
++.Loop16_warmup:
++	add	$TX[0]#b,$YY#b
++	movl	($dat,$YY,4),$TY#d
++	movl	$TX[0]#d,($dat,$YY,4)
++	movl	$TY#d,($dat,$XX[0],4)
++	add	$TY#b,$TX[0]#b
++	inc	$XX[0]#b
++	movl	($dat,$TX[0],4),$TY#d
++	movl	($dat,$XX[0],4),$TX[0]#d
++	xorb	($inp),$TY#b
++	movb	$TY#b,($out,$inp)
++	lea	1($inp),$inp
++	dec	$TX[1]
++	jnz	.Loop16_warmup
++
++	mov	$YY,$TX[1]
++	xor	$YY,$YY
++	mov	$TX[1]#b,$YY#b
++
++.Loop16_is_hot:
++	lea	($dat,$XX[0],4),$XX[1]
++___
++sub RC4_loop {
++  my $i=shift;
++  my $j=$i<0?0:$i;
++  my $xmm="%xmm".($j&1);
++
++    $code.="	add	\$16,$XX[0]#b\n"		if ($i==15);
++    $code.="	movdqu	($inp),%xmm2\n"			if ($i==15);
++    $code.="	add	$TX[0]#b,$YY#b\n"		if ($i<=0);
++    $code.="	movl	($dat,$YY,4),$TY#d\n";
++    $code.="	pxor	%xmm0,%xmm2\n"			if ($i==0);
++    $code.="	psllq	\$8,%xmm1\n"			if ($i==0);
++    $code.="	pxor	$xmm,$xmm\n"			if ($i<=1);
++    $code.="	movl	$TX[0]#d,($dat,$YY,4)\n";
++    $code.="	add	$TY#b,$TX[0]#b\n";
++    $code.="	movl	`4*($j+1)`($XX[1]),$TX[1]#d\n"	if ($i<15);
++    $code.="	movz	$TX[0]#b,$TX[0]#d\n";
++    $code.="	movl	$TY#d,`4*$j`($XX[1])\n";
++    $code.="	pxor	%xmm1,%xmm2\n"			if ($i==0);
++    $code.="	lea	($dat,$XX[0],4),$XX[1]\n"	if ($i==15);
++    $code.="	add	$TX[1]#b,$YY#b\n"		if ($i<15);
++    $code.="	pinsrw	\$`$j>>1`,($dat,$TX[0],4),$xmm\n";
++    $code.="	movdqu	%xmm2,($out,$inp)\n"		if ($i==0);
++    $code.="	lea	16($inp),$inp\n"		if ($i==0);
++    $code.="	movl	($XX[1]),$TX[1]#d\n"		if ($i==15);
++}
++	RC4_loop(-1);
++$code.=<<___;
++	jmp	.Loop16_enter
++.align	16
++.Loop16:
++___
++
++for ($i=0;$i<16;$i++) {
++    $code.=".Loop16_enter:\n"		if ($i==1);
++	RC4_loop($i);
++	push(@TX,shift(@TX)); 		# "rotate" registers
++}
++$code.=<<___;
++	mov	$YY,$TX[1]
++	xor	$YY,$YY			# keyword to partial register
++	sub	\$16,$len
++	mov	$TX[1]#b,$YY#b
++	test	\$-16,$len
++	jnz	.Loop16
 +
- open STDOUT,"| $^X $xlate $flavour $output";
- 
- $ctx="%rdi";	# 1st arg
-@@ -51,196 +90,994 @@ $ctx="%r8";
- $inp="%r9";
- $num="%r10";
- 
--$xi="%eax";
--$t0="%ebx";
--$t1="%ecx";
--$A="%edx";
--$B="%esi";
--$C="%edi";
--$D="%ebp";
--$E="%r11d";
--$T="%r12d";
--
-- at V=($A,$B,$C,$D,$E,$T);
-+$t0="%eax";
-+$t1="%ebx";
-+$t2="%ecx";
-+ at xi=("%edx","%ebp");
-+$A="%esi";
-+$B="%edi";
-+$C="%r11d";
-+$D="%r12d";
-+$E="%r13d";
- 
--sub PROLOGUE {
--my $func=shift;
--$code.=<<___;
--.globl	$func
--.type	$func,\@function,3
--.align	16
--$func:
--	push	%rbx
--	push	%rbp
--	push	%r12
--	mov	%rsp,%r11
--	mov	%rdi,$ctx	# reassigned argument
--	sub	\$`8+16*4`,%rsp
--	mov	%rsi,$inp	# reassigned argument
--	and	\$-64,%rsp
--	mov	%rdx,$num	# reassigned argument
--	mov	%r11,`16*4`(%rsp)
--.Lprologue:
--
--	mov	0($ctx),$A
--	mov	4($ctx),$B
--	mov	8($ctx),$C
--	mov	12($ctx),$D
--	mov	16($ctx),$E
--___
--}
--
--sub EPILOGUE {
--my $func=shift;
--$code.=<<___;
--	mov	`16*4`(%rsp),%rsi
--	mov	(%rsi),%r12
--	mov	8(%rsi),%rbp
--	mov	16(%rsi),%rbx
--	lea	24(%rsi),%rsp
--.Lepilogue:
--	ret
--.size	$func,.-$func
--___
--}
-+ at V=($A,$B,$C,$D,$E);
- 
- sub BODY_00_19 {
--my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
-+my ($i,$a,$b,$c,$d,$e)=@_;
- my $j=$i+1;
- $code.=<<___ if ($i==0);
--	mov	`4*$i`($inp),$xi	
--	`"bswap	$xi"	if(!defined($host))`
--	mov	$xi,`4*$i`(%rsp)
-+	mov	`4*$i`($inp),$xi[0]
-+	bswap	$xi[0]
-+	mov	$xi[0],`4*$i`(%rsp)
++	psllq	\$8,%xmm1
++	pxor	%xmm0,%xmm2
++	pxor	%xmm1,%xmm2
++	movdqu	%xmm2,($out,$inp)
++	lea	16($inp),$inp
++
+ 	cmp	\$0,$len
+ 	jne	.Lloop1
+ 	jmp	.Lexit
+@@ -152,9 +314,8 @@ $code.=<<___;
+ 	movl	($dat,$TX[0],4),$TY#d
+ 	movl	($dat,$XX[0],4),$TX[0]#d
+ 	xorb	($inp),$TY#b
+-	inc	$inp
+-	movb	$TY#b,($out)
+-	inc	$out
++	movb	$TY#b,($out,$inp)
++	lea	1($inp),$inp
+ 	dec	$len
+ 	jnz	.Lloop1
+ 	jmp	.Lexit
+@@ -165,13 +326,11 @@ $code.=<<___;
+ 	movzb	($dat,$XX[0]),$TX[0]#d
+ 	test	\$-8,$len
+ 	jz	.Lcloop1
+-	cmpl	\$0,260($dat)
+-	jnz	.Lcloop1
+ 	jmp	.Lcloop8
+ .align	16
+ .Lcloop8:
+-	mov	($inp),%eax
+-	mov	4($inp),%ebx
++	mov	($inp),%r8d
++	mov	4($inp),%r9d
  ___
- $code.=<<___ if ($i<15);
--	lea	0x5a827999($xi,$e),$f
- 	mov	$c,$t0
--	mov	`4*$j`($inp),$xi
--	mov	$a,$e
-+	mov	`4*$j`($inp),$xi[1]
-+	mov	$a,$t2
- 	xor	$d,$t0
--	`"bswap	$xi"	if(!defined($host))`	
--	rol	\$5,$e
-+	bswap	$xi[1]
-+	rol	\$5,$t2
-+	lea	0x5a827999($xi[0],$e),$e
- 	and	$b,$t0
--	mov	$xi,`4*$j`(%rsp)
--	add	$e,$f
-+	mov	$xi[1],`4*$j`(%rsp)
-+	add	$t2,$e
- 	xor	$d,$t0
- 	rol	\$30,$b
--	add	$t0,$f
-+	add	$t0,$e
+ # unroll 2x4-wise, because 64-bit rotates kill Intel P4...
+ for ($i=0;$i<4;$i++) {
+@@ -188,8 +347,8 @@ $code.=<<___;
+ 	mov	$TX[0],$TX[1]
+ .Lcmov$i:
+ 	add	$TX[0]#b,$TY#b
+-	xor	($dat,$TY),%al
+-	ror	\$8,%eax
++	xor	($dat,$TY),%r8b
++	ror	\$8,%r8d
  ___
- $code.=<<___ if ($i>=15);
--	lea	0x5a827999($xi,$e),$f
--	mov	`4*($j%16)`(%rsp),$xi
-+	mov	`4*($j%16)`(%rsp),$xi[1]
- 	mov	$c,$t0
--	mov	$a,$e
--	xor	`4*(($j+2)%16)`(%rsp),$xi
-+	mov	$a,$t2
-+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
- 	xor	$d,$t0
--	rol	\$5,$e
--	xor	`4*(($j+8)%16)`(%rsp),$xi
-+	rol	\$5,$t2
-+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
- 	and	$b,$t0
--	add	$e,$f
--	xor	`4*(($j+13)%16)`(%rsp),$xi
-+	lea	0x5a827999($xi[0],$e),$e
-+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
- 	xor	$d,$t0
-+	rol	\$1,$xi[1]
-+	add	$t2,$e
- 	rol	\$30,$b
--	add	$t0,$f
--	rol	\$1,$xi
--	mov	$xi,`4*($j%16)`(%rsp)
-+	mov	$xi[1],`4*($j%16)`(%rsp)
-+	add	$t0,$e
+ push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+ }
+@@ -207,16 +366,16 @@ $code.=<<___;
+ 	mov	$TX[0],$TX[1]
+ .Lcmov$i:
+ 	add	$TX[0]#b,$TY#b
+-	xor	($dat,$TY),%bl
+-	ror	\$8,%ebx
++	xor	($dat,$TY),%r9b
++	ror	\$8,%r9d
  ___
-+unshift(@xi,pop(@xi));
+ push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
  }
+ $code.=<<___;
+ 	lea	-8($len),$len
+-	mov	%eax,($out)
++	mov	%r8d,($out)
+ 	lea	8($inp),$inp
+-	mov	%ebx,4($out)
++	mov	%r9d,4($out)
+ 	lea	8($out),$out
  
- sub BODY_20_39 {
--my ($i,$a,$b,$c,$d,$e,$f)=@_;
-+my ($i,$a,$b,$c,$d,$e)=@_;
- my $j=$i+1;
- my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
- $code.=<<___ if ($i<79);
--	lea	$K($xi,$e),$f
--	mov	`4*($j%16)`(%rsp),$xi
-+	mov	`4*($j%16)`(%rsp),$xi[1]
- 	mov	$c,$t0
--	mov	$a,$e
--	xor	`4*(($j+2)%16)`(%rsp),$xi
-+	mov	$a,$t2
-+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
- 	xor	$b,$t0
--	rol	\$5,$e
--	xor	`4*(($j+8)%16)`(%rsp),$xi
-+	rol	\$5,$t2
-+	lea	$K($xi[0],$e),$e
-+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
- 	xor	$d,$t0
--	add	$e,$f
--	xor	`4*(($j+13)%16)`(%rsp),$xi
-+	add	$t2,$e
-+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
- 	rol	\$30,$b
--	add	$t0,$f
--	rol	\$1,$xi
-+	add	$t0,$e
-+	rol	\$1,$xi[1]
- ___
- $code.=<<___ if ($i<76);
--	mov	$xi,`4*($j%16)`(%rsp)
-+	mov	$xi[1],`4*($j%16)`(%rsp)
- ___
- $code.=<<___ if ($i==79);
--	lea	$K($xi,$e),$f
- 	mov	$c,$t0
--	mov	$a,$e
-+	mov	$a,$t2
- 	xor	$b,$t0
--	rol	\$5,$e
-+	lea	$K($xi[0],$e),$e
-+	rol	\$5,$t2
- 	xor	$d,$t0
--	add	$e,$f
-+	add	$t2,$e
- 	rol	\$30,$b
--	add	$t0,$f
-+	add	$t0,$e
+ 	test	\$-8,$len
+@@ -229,6 +388,7 @@ $code.=<<___;
+ .align	16
+ .Lcloop1:
+ 	add	$TX[0]#b,$YY#b
++	movzb	$YY#b,$YY#d
+ 	movzb	($dat,$YY),$TY#d
+ 	movb	$TX[0]#b,($dat,$YY)
+ 	movb	$TY#b,($dat,$XX[0])
+@@ -260,12 +420,12 @@ $code.=<<___;
+ 	ret
+ .size	RC4,.-RC4
  ___
-+unshift(@xi,pop(@xi));
- }
++}
+ 
+ $idx="%r8";
+ $ido="%r9";
  
- sub BODY_40_59 {
--my ($i,$a,$b,$c,$d,$e,$f)=@_;
-+my ($i,$a,$b,$c,$d,$e)=@_;
- my $j=$i+1;
  $code.=<<___;
--	lea	0x8f1bbcdc($xi,$e),$f
--	mov	`4*($j%16)`(%rsp),$xi
--	mov	$b,$t0
--	mov	$b,$t1
--	xor	`4*(($j+2)%16)`(%rsp),$xi
--	mov	$a,$e
--	and	$c,$t0
--	xor	`4*(($j+8)%16)`(%rsp),$xi
--	or	$c,$t1
--	rol	\$5,$e
--	xor	`4*(($j+13)%16)`(%rsp),$xi
--	and	$d,$t1
--	add	$e,$f
--	rol	\$1,$xi
--	or	$t1,$t0
-+	mov	`4*($j%16)`(%rsp),$xi[1]
-+	mov	$c,$t0
-+	mov	$c,$t1
-+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
-+	and	$d,$t0
-+	mov	$a,$t2
-+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
-+	xor	$d,$t1
-+	lea	0x8f1bbcdc($xi[0],$e),$e
-+	rol	\$5,$t2
-+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
-+	add	$t0,$e
-+	and	$b,$t1
-+	rol	\$1,$xi[1]
-+	add	$t1,$e
- 	rol	\$30,$b
--	mov	$xi,`4*($j%16)`(%rsp)
--	add	$t0,$f
-+	mov	$xi[1],`4*($j%16)`(%rsp)
-+	add	$t2,$e
+-.extern	OPENSSL_ia32cap_P
+ .globl	RC4_set_key
+ .type	RC4_set_key,\@function,3
+ .align	16
+@@ -280,12 +440,9 @@ RC4_set_key:
+ 	xor	%r11,%r11
+ 
+ 	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+-	bt	\$20,$idx#d
+-	jnc	.Lw1stloop
+-	bt	\$30,$idx#d
+-	setc	$ido#b
+-	mov	$ido#d,260($dat)
+-	jmp	.Lc1stloop
++	bt	\$20,$idx#d	# RC4_CHAR?
++	jc	.Lc1stloop
++	jmp	.Lw1stloop
+ 
+ .align	16
+ .Lw1stloop:
+@@ -348,18 +505,20 @@ RC4_options:
+ 	lea	.Lopts(%rip),%rax
+ 	mov	OPENSSL_ia32cap_P(%rip),%edx
+ 	bt	\$20,%edx
+-	jnc	.Ldone
+-	add	\$12,%rax
++	jc	.L8xchar
+ 	bt	\$30,%edx
+ 	jnc	.Ldone
+-	add	\$13,%rax
++	add	\$25,%rax
++	ret
++.L8xchar:
++	add	\$12,%rax
+ .Ldone:
+ 	ret
+ .align	64
+ .Lopts:
+ .asciz	"rc4(8x,int)"
+ .asciz	"rc4(8x,char)"
+-.asciz	"rc4(1x,char)"
++.asciz	"rc4(16x,int)"
+ .asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+ .align	64
+ .size	RC4_options,.-RC4_options
+@@ -497,8 +656,17 @@ key_se_handler:
  ___
-+unshift(@xi,pop(@xi));
  }
  
--$code=".text\n";
-+$code.=<<___;
-+.text
-+.extern	OPENSSL_ia32cap_X
-+
-+.globl	sha1_block_data_order
-+.type	sha1_block_data_order,\@function,3
-+.align	16
-+sha1_block_data_order:
-+	mov	OPENSSL_ia32cap_X+0(%rip),%r9d
-+	mov	OPENSSL_ia32cap_X+4(%rip),%r8d
-+	test	\$`1<<9`,%r8d		# check SSSE3 bit
-+	jz	.Lialu
-+___
-+$code.=<<___ if ($avx);
-+	and	\$`1<<28`,%r8d		# mask AVX bit
-+	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
-+	or	%r9d,%r8d
-+	cmp	\$`1<<28|1<<30`,%r8d
-+	je	_avx_shortcut
-+___
-+$code.=<<___;
-+	jmp	_ssse3_shortcut
+-$code =~ s/#([bwd])/$1/gm;
++sub reg_part {
++my ($reg,$conv)=@_;
++    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
++    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
++    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
++    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
++    return $reg;
++}
+ 
++$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
++$code =~ s/\`([^\`]*)\`/eval $1/gem;
+ $code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
+ 
+ print $code;
+diff -up openssl-1.0.0k/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0k/crypto/sha/asm/sha1-586.pl
+--- openssl-1.0.0k/crypto/sha/asm/sha1-586.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/asm/sha1-586.pl	2013-02-19 21:15:39.633408143 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+ 
+ # ====================================================================
+ # [Re]written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
+@@ -12,6 +12,8 @@
+ # commentary below], and in 2006 the rest was rewritten in order to
+ # gain freedom to liberate licensing terms.
+ 
++# January, September 2004.
++#
+ # It was noted that Intel IA-32 C compiler generates code which
+ # performs ~30% *faster* on P4 CPU than original *hand-coded*
+ # SHA1 assembler implementation. To address this problem (and
+@@ -31,12 +33,92 @@
+ # ----------------------------------------------------------------
+ #					<appro at fy.chalmers.se>
+ 
++# August 2009.
++#
++# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
++# '(c&d) + (b&(c^d))', which allows to accumulate partial results
++# and lighten "pressure" on scratch registers. This resulted in
++# >12% performance improvement on contemporary AMD cores (with no
++# degradation on other CPUs:-). Also, the code was revised to maximize
++# "distance" between instructions producing input to 'lea' instruction
++# and the 'lea' instruction itself, which is essential for Intel Atom
++# core and resulted in ~15% improvement.
 +
-+.align	16
-+.Lialu:
-+	push	%rbx
-+	push	%rbp
-+	push	%r12
-+	push	%r13
-+	mov	%rsp,%r11
-+	mov	%rdi,$ctx	# reassigned argument
-+	sub	\$`8+16*4`,%rsp
-+	mov	%rsi,$inp	# reassigned argument
-+	and	\$-64,%rsp
-+	mov	%rdx,$num	# reassigned argument
-+	mov	%r11,`16*4`(%rsp)
-+.Lprologue:
++# October 2010.
++#
++# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
++# is to offload message schedule denoted by Wt in NIST specification,
++# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
++# and in SSE2 context was first explored by Dean Gaudet in 2004, see
++# http://arctic.org/~dean/crypto/sha1.html. Since then several things
++# have changed that made it interesting again:
++#
++# a) XMM units became faster and wider;
++# b) instruction set became more versatile;
++# c) an important observation was made by Max Locktykhin, which made
++#    it possible to reduce amount of instructions required to perform
++#    the operation in question, for further details see
++#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
 +
-+	mov	0($ctx),$A
-+	mov	4($ctx),$B
-+	mov	8($ctx),$C
-+	mov	12($ctx),$D
-+	mov	16($ctx),$E
-+	jmp	.Lloop
- 
--&PROLOGUE("sha1_block_data_order");
--$code.=".align	4\n.Lloop:\n";
-+.align	16
-+.Lloop:
-+___
- for($i=0;$i<20;$i++)	{ &BODY_00_19($i, at V); unshift(@V,pop(@V)); }
- for(;$i<40;$i++)	{ &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
- for(;$i<60;$i++)	{ &BODY_40_59($i, at V); unshift(@V,pop(@V)); }
- for(;$i<80;$i++)	{ &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
- $code.=<<___;
--	add	0($ctx),$E
--	add	4($ctx),$T
--	add	8($ctx),$A
--	add	12($ctx),$B
--	add	16($ctx),$C
--	mov	$E,0($ctx)
--	mov	$T,4($ctx)
--	mov	$A,8($ctx)
--	mov	$B,12($ctx)
--	mov	$C,16($ctx)
--
--	xchg	$E,$A	# mov	$E,$A
--	xchg	$T,$B	# mov	$T,$B
--	xchg	$E,$C	# mov	$A,$C
--	xchg	$T,$D	# mov	$B,$D
--			# mov	$C,$E
--	lea	`16*4`($inp),$inp
-+	add	0($ctx),$A
-+	add	4($ctx),$B
-+	add	8($ctx),$C
-+	add	12($ctx),$D
-+	add	16($ctx),$E
-+	mov	$A,0($ctx)
-+	mov	$B,4($ctx)
-+	mov	$C,8($ctx)
-+	mov	$D,12($ctx)
-+	mov	$E,16($ctx)
++# April 2011.
++#
++# Add AVX code path, probably most controversial... The thing is that
++# switch to AVX alone improves performance by as little as 4% in
++# comparison to SSSE3 code path. But below result doesn't look like
++# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
++# pair of µ-ops, and it's the additional µ-ops, two per round, that
++# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
++# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
++# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
++# cycles per processed byte. But 'sh[rl]d' is not something that used
++# to be fast, nor does it appear to be fast in upcoming Bulldozer
++# [according to its optimization manual]. Which is why AVX code path
++# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
++# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
++# makes no sense to keep the AVX code path. If somebody feels that
++# strongly, it's probably more appropriate to discuss possibility of
++# using vector rotate XOP on AMD...
 +
- 	sub	\$1,$num
-+	lea	`16*4`($inp),$inp
- 	jnz	.Lloop
++######################################################################
++# Current performance is summarized in following table. Numbers are
++# CPU clock cycles spent to process single byte (less is better).
++#
++#		x86		SSSE3		AVX
++# Pentium	15.7		-
++# PIII		11.5		-
++# P4		10.6		-
++# AMD K8	7.1		-
++# Core2		7.3		6.1/+20%	-
++# Atom		12.5		9.5(*)/+32%	-
++# Westmere	7.3		5.6/+30%	-
++# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
++#
++# (*)	Loop is 1056 instructions long and expected result is ~8.25.
++#	It remains mystery [to me] why ILP is limited to 1.7.
++#
++# (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
 +
-+	mov	`16*4`(%rsp),%rsi
-+	mov	(%rsi),%r13
-+	mov	8(%rsi),%r12
-+	mov	16(%rsi),%rbp
-+	mov	24(%rsi),%rbx
-+	lea	32(%rsi),%rsp
-+.Lepilogue:
-+	ret
-+.size	sha1_block_data_order,.-sha1_block_data_order
- ___
--&EPILOGUE("sha1_block_data_order");
-+{{{
-+my $Xi=4;
-+my @X=map("%xmm$_",(4..7,0..3));
-+my @Tx=map("%xmm$_",(8..10));
-+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
-+my @T=("%esi","%edi");
-+my $j=0;
-+my $K_XX_XX="%r11";
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC,"${dir}","${dir}../../perlasm");
+ require "x86asm.pl";
+ 
+ &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
+ 
++$xmm=1; $ymm=0;
++for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
 +
-+my $_rol=sub { &rol(@_) };
-+my $_ror=sub { &ror(@_) };
++$ymm=1 if ($xmm &&
++		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
++		$1>=2.19);	# first version supporting AVX
 +
- $code.=<<___;
--.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-+.type	sha1_block_data_order_ssse3,\@function,3
- .align	16
-+sha1_block_data_order_ssse3:
-+_ssse3_shortcut:
-+	push	%rbx
-+	push	%rbp
-+	push	%r12
-+	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
-+___
-+$code.=<<___ if ($win64);
-+	movaps	%xmm6,64+0(%rsp)
-+	movaps	%xmm7,64+16(%rsp)
-+	movaps	%xmm8,64+32(%rsp)
-+	movaps	%xmm9,64+48(%rsp)
-+	movaps	%xmm10,64+64(%rsp)
-+.Lprologue_ssse3:
-+___
-+$code.=<<___;
-+	mov	%rdi,$ctx	# reassigned argument
-+	mov	%rsi,$inp	# reassigned argument
-+	mov	%rdx,$num	# reassigned argument
++$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && 
++		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
++		$1>=2.03);	# first version supporting AVX
 +
-+	shl	\$6,$num
-+	add	$inp,$num
-+	lea	K_XX_XX(%rip),$K_XX_XX
++&external_label("OPENSSL_ia32cap_X") if ($xmm);
 +
-+	mov	0($ctx),$A		# load context
-+	mov	4($ctx),$B
-+	mov	8($ctx),$C
-+	mov	12($ctx),$D
-+	mov	$B, at T[0]		# magic seed
-+	mov	16($ctx),$E
 +
-+	movdqa	64($K_XX_XX), at X[2]	# pbswap mask
-+	movdqa	0($K_XX_XX), at Tx[1]	# K_00_19
-+	movdqu	0($inp), at X[-4&7]	# load input to %xmm[0-3]
-+	movdqu	16($inp), at X[-3&7]
-+	movdqu	32($inp), at X[-2&7]
-+	movdqu	48($inp), at X[-1&7]
-+	pshufb	@X[2], at X[-4&7]		# byte swap
-+	add	\$64,$inp
-+	pshufb	@X[2], at X[-3&7]
-+	pshufb	@X[2], at X[-2&7]
-+	pshufb	@X[2], at X[-1&7]
-+	paddd	@Tx[1], at X[-4&7]		# add K_00_19
-+	paddd	@Tx[1], at X[-3&7]
-+	paddd	@Tx[1], at X[-2&7]
-+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
-+	psubd	@Tx[1], at X[-4&7]		# restore X[]
-+	movdqa	@X[-3&7],16(%rsp)
-+	psubd	@Tx[1], at X[-3&7]
-+	movdqa	@X[-2&7],32(%rsp)
-+	psubd	@Tx[1], at X[-2&7]
-+	jmp	.Loop_ssse3
-+___
+ $A="eax";
+ $B="ebx";
+ $C="ecx";
+@@ -47,6 +129,10 @@ $tmp1="ebp";
+ 
+ @V=($A,$B,$C,$D,$E,$T);
+ 
++$alt=0;	# 1 denotes alternative IALU implementation, which performs
++	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
++	# Sandy Bridge...
 +
-+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
-+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
-+  my $arg = pop;
-+    $arg = "\$$arg" if ($arg*1 eq $arg);
-+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+ sub BODY_00_15
+ 	{
+ 	local($n,$a,$b,$c,$d,$e,$f)=@_;
+@@ -59,16 +145,18 @@ sub BODY_00_15
+ 	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
+ 	 &xor($f,$d);
+ 	&add($tmp1,$e);			# tmp1+=e;
+-	 &and($f,$b);
+-	&mov($e,&swtmp($n%16));		# e becomes volatile and is loaded
++	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
+ 	 				# with xi, also note that e becomes
+ 					# f in next round...
+-	 &xor($f,$d);			# f holds F_00_19(b,c,d)
++	&and($f,$b);
+ 	&rotr($b,2);			# b=ROTATE(b,30)
+-	 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
++	 &xor($f,$d);			# f holds F_00_19(b,c,d)
++	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
+ 
+-	if ($n==15) { &add($f,$tmp1); }	# f+=tmp1
++	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
++		      &add($f,$tmp1); }	# f+=tmp1
+ 	else        { &add($tmp1,$f); }	# f becomes a in next round
++	&mov($tmp1,$a)			if ($alt && $n==15);
+ 	}
+ 
+ sub BODY_16_19
+@@ -77,22 +165,41 @@ sub BODY_16_19
+ 
+ 	&comment("16_19 $n");
+ 
+-	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
+-	 &mov($tmp1,$c);		# tmp1 to hold F_00_19(b,c,d)
+-	&xor($f,&swtmp(($n+2)%16));
+-	 &xor($tmp1,$d);
+-	&xor($f,&swtmp(($n+8)%16));
+-	 &and($tmp1,$b);		# tmp1 holds F_00_19(b,c,d)
+-	&rotr($b,2);			# b=ROTATE(b,30)
++if ($alt) {
++	&xor($c,$d);
++	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
++	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
++	 &xor($f,&swtmp(($n+8)%16));
++	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
++	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
++	&rotl($f,1);			# f=ROTATE(f,1)
++	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
++	&xor($c,$d);			# restore $c
++	 &mov($tmp1,$a);		# b in next round
++	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
++	 &mov(&swtmp($n%16),$f);	# xi=f
++	&rotl($a,5);			# ROTATE(a,5)
++	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
++	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
++	 &add($f,$a);			# f+=ROTATE(a,5)
++} else {
++	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
++	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
++	&xor($tmp1,$d);
++	 &xor($f,&swtmp(($n+8)%16));
++	&and($tmp1,$b);
+ 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+ 	&rotl($f,1);			# f=ROTATE(f,1)
+ 	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
+-	&mov(&swtmp($n%16),$f);		# xi=f
+-	&lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
+-	 &mov($e,$a);			# e becomes volatile
+-	&rotl($e,5);			# e=ROTATE(a,5)
+-	 &add($f,$tmp1);		# f+=F_00_19(b,c,d)
+-	&add($f,$e);			# f+=ROTATE(a,5)
++	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
++	 &mov($tmp1,$a);
++	&rotr($b,2);			# b=ROTATE(b,30)
++	 &mov(&swtmp($n%16),$f);	# xi=f
++	&rotl($tmp1,5);			# ROTATE(a,5)
++	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
++	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
++	 &add($f,$tmp1);		# f+=ROTATE(a,5)
++}
+ 	}
+ 
+ sub BODY_20_39
+@@ -102,21 +209,41 @@ sub BODY_20_39
+ 
+ 	&comment("20_39 $n");
+ 
++if ($alt) {
++	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
++	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
++	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
++	 &xor($f,&swtmp(($n+8)%16));
++	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
++	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
++	&rotl($f,1);			# f=ROTATE(f,1)
++	 &mov($tmp1,$a);		# b in next round
++	&rotr($b,7);			# b=ROTATE(b,30)
++	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
++	&rotl($a,5);			# ROTATE(a,5)
++	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
++	&and($tmp1,$b)			if($n==39);
++	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
++	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
++	 &add($f,$a);			# f+=ROTATE(a,5)
++	&rotr($a,5)			if ($n==79);
++} else {
+ 	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
+-	 &mov($f,&swtmp($n%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+-	&rotr($b,2);			# b=ROTATE(b,30)
+-	 &xor($f,&swtmp(($n+2)%16));
++	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+ 	&xor($tmp1,$c);
+ 	 &xor($f,&swtmp(($n+8)%16));
+ 	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
+ 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+ 	&rotl($f,1);			# f=ROTATE(f,1)
+-	 &add($tmp1,$e);
+-	&mov(&swtmp($n%16),$f);		# xi=f
+-	 &mov($e,$a);			# e becomes volatile
+-	&rotl($e,5);			# e=ROTATE(a,5)
+-	 &lea($f,&DWP($K,$f,$tmp1));	# f+=K_20_39+e
+-	&add($f,$e);			# f+=ROTATE(a,5)
++	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
++	&rotr($b,2);			# b=ROTATE(b,30)
++	 &mov($tmp1,$a);
++	&rotl($tmp1,5);			# ROTATE(a,5)
++	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
++	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
++	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
++	&add($f,$tmp1);			# f+=ROTATE(a,5)
 +}
-+
-+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
-+{ use integer;
-+  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
-+  my ($a,$b,$c,$d,$e);
-+
-+	&movdqa	(@X[0], at X[-3&7]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&movdqa	(@Tx[0], at X[-1&7]);
-+	&palignr(@X[0], at X[-4&7],8);	# compose "X[-14]" in "X[0]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	  &paddd	(@Tx[1], at X[-1&7]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&pxor	(@X[0], at X[-4&7]);	# "X[0]"^="X[-16]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&pxor	(@Tx[0], at X[-2&7]);	# "X[-3]"^"X[-8]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&pxor	(@X[0], at Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&movdqa	(@Tx[2], at X[0]);
-+	&movdqa	(@Tx[0], at X[0]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
-+	&paddd	(@X[0], at X[0]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&psrld	(@Tx[0],31);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&movdqa	(@Tx[1], at Tx[2]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&psrld	(@Tx[2],30);
-+	&por	(@X[0], at Tx[0]);		# "X[0]"<<<=1
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&pslld	(@Tx[1],2);
-+	&pxor	(@X[0], at Tx[2]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+
-+	&pxor	(@X[0], at Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
-+
-+	 foreach (@insns) { eval; }	# remaining instructions [if any]
-+
-+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
-+		push(@Tx,shift(@Tx));
+ 	}
+ 
+ sub BODY_40_59
+@@ -125,41 +252,86 @@ sub BODY_40_59
+ 
+ 	&comment("40_59 $n");
+ 
+-	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
+-	 &mov($tmp1,&swtmp(($n+2)%16));
+-	&xor($f,$tmp1);
+-	 &mov($tmp1,&swtmp(($n+8)%16));
+-	&xor($f,$tmp1);
+-	 &mov($tmp1,&swtmp(($n+13)%16));
+-	&xor($f,$tmp1);			# f holds xa^xb^xc^xd
+-	 &mov($tmp1,$b);		# tmp1 to hold F_40_59(b,c,d)
++if ($alt) {
++	&add($e,$tmp1);			# e+=b&(c^d)
++	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
++	&mov($tmp1,$d);
++	 &xor($f,&swtmp(($n+8)%16));
++	&xor($c,$d);			# restore $c
++	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+ 	&rotl($f,1);			# f=ROTATE(f,1)
+-	 &or($tmp1,$c);
+-	&mov(&swtmp($n%16),$f);		# xi=f
+-	 &and($tmp1,$d);
+-	&lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
+-	 &mov($e,$b);			# e becomes volatile and is used
+-					# to calculate F_40_59(b,c,d)
++	 &and($tmp1,$c);
++	&rotr($b,7);			# b=ROTATE(b,30)
++	 &add($e,$tmp1);		# e+=c&d
++	&mov($tmp1,$a);			# b in next round
++	 &mov(&swtmp($n%16),$f);	# xi=f
++	&rotl($a,5);			# ROTATE(a,5)
++	 &xor($b,$c)			if ($n<59);
++	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
++	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
++	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
++	 &add($f,$a);			# f+=ROTATE(a,5)
++} else {
++	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
++	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
++	&xor($tmp1,$d);
++	 &xor($f,&swtmp(($n+8)%16));
++	&and($tmp1,$b);
++	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
++	&rotl($f,1);			# f=ROTATE(f,1)
++	 &add($tmp1,$e);		# b&(c^d)+=e
+ 	&rotr($b,2);			# b=ROTATE(b,30)
+-	 &and($e,$c);
+-	&or($tmp1,$e);			# tmp1 holds F_40_59(b,c,d)		
+-	 &mov($e,$a);
+-	&rotl($e,5);			# e=ROTATE(a,5)
+-	 &add($f,$tmp1);		# f+=tmp1;
++	 &mov($e,$a);			# e becomes volatile
++	&rotl($e,5);			# ROTATE(a,5)
++	 &mov(&swtmp($n%16),$f);	# xi=f
++	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
++	 &mov($tmp1,$c);
+ 	&add($f,$e);			# f+=ROTATE(a,5)
++	 &and($tmp1,$d);
++	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
++	 &add($f,$tmp1);		# f+=c&d
 +}
+ 	}
+ 
+ &function_begin("sha1_block_data_order");
++if ($xmm) {
++  &static_label("ssse3_shortcut");
++  &static_label("avx_shortcut")		if ($ymm);
++  &static_label("K_XX_XX");
 +
-+sub Xupdate_ssse3_32_79()
-+{ use integer;
-+  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
-+  my ($a,$b,$c,$d,$e);
-+
-+	&movdqa	(@Tx[0], at X[-1&7])	if ($Xi==8);
-+	 eval(shift(@insns));		# body_20_39
-+	&pxor	(@X[0], at X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
-+	&palignr(@Tx[0], at X[-2&7],8);	# compose "X[-6]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
++	&call	(&label("pic_point"));	# make it PIC!
++  &set_label("pic_point");
++	&blindpop($tmp1);
++	&picmeup($T,"OPENSSL_ia32cap_X",$tmp1,&label("pic_point"));
++	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
 +
-+	&pxor	(@X[0], at X[-7&7]);	# "X[0]"^="X[-28]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
-+	if ($Xi%5) {
-+	  &movdqa	(@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
-+	} else {			# ... or load next one
-+	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
++	&mov	($A,&DWP(0,$T));
++	&mov	($D,&DWP(4,$T));
++	&test	($D,1<<9);		# check SSSE3 bit
++	&jz	(&label("x86"));
++	&test	($A,1<<24);		# check FXSR bit
++	&jz	(&label("x86"));
++	if ($ymm) {
++		&and	($D,1<<28);		# mask AVX bit
++		&and	($A,1<<30);		# mask "Intel CPU" bit
++		&or	($A,$D);
++		&cmp	($A,1<<28|1<<30);
++		&je	(&label("avx_shortcut"));
 +	}
-+	  &paddd	(@Tx[1], at X[-1&7]);
-+	 eval(shift(@insns));		# ror
-+	 eval(shift(@insns));
-+
-+	&pxor	(@X[0], at Tx[0]);		# "X[0]"^="X[-6]"
-+	 eval(shift(@insns));		# body_20_39
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
-+
-+	&movdqa	(@Tx[0], at X[0]);
-+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# ror
-+	 eval(shift(@insns));
-+
-+	&pslld	(@X[0],2);
-+	 eval(shift(@insns));		# body_20_39
-+	 eval(shift(@insns));
-+	&psrld	(@Tx[0],30);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# ror
-+	 eval(shift(@insns));
-+
-+	&por	(@X[0], at Tx[0]);		# "X[0]"<<<=2
-+	 eval(shift(@insns));		# body_20_39
-+	 eval(shift(@insns));
-+	  &movdqa	(@Tx[1], at X[0])	if ($Xi<19);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
-+	 eval(shift(@insns));
-+
-+	 foreach (@insns) { eval; }	# remaining instructions
-+
-+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
-+		push(@Tx,shift(@Tx));
++	&jmp	(&label("ssse3_shortcut"));
++  &set_label("x86",16);
 +}
+ 	&mov($tmp1,&wparam(0));	# SHA_CTX *c
+ 	&mov($T,&wparam(1));	# const void *input
+ 	&mov($A,&wparam(2));	# size_t num
+-	&stack_push(16);	# allocate X[16]
++	&stack_push(16+3);	# allocate X[16]
+ 	&shl($A,6);
+ 	&add($A,$T);
+ 	&mov(&wparam(2),$A);	# pointer beyond the end of input
+ 	&mov($E,&DWP(16,$tmp1));# pre-load E
++	&jmp(&label("loop"));
+ 
+-	&set_label("loop",16);
++&set_label("loop",16);
+ 
+ 	# copy input chunk to X, but reversing byte order!
+ 	for ($i=0; $i<16; $i+=4)
+@@ -213,8 +385,845 @@ sub BODY_40_59
+ 	&mov(&DWP(16,$tmp1),$C);
+ 	&jb(&label("loop"));
+ 
+-	&stack_pop(16);
++	&stack_pop(16+3);
+ &function_end("sha1_block_data_order");
 +
-+sub Xuplast_ssse3_80()
-+{ use integer;
-+  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
-+  my ($a,$b,$c,$d,$e);
++if ($xmm) {
++######################################################################
++# The SSSE3 implementation.
++#
++# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
++# 32 elements of the message schedule or Xupdate outputs. First 4
++# quadruples are simply byte-swapped input, next 4 are calculated
++# according to method originally suggested by Dean Gaudet (modulo
++# being implemented in SSSE3). Once 8 quadruples or 32 elements are
++# collected, it switches to routine proposed by Max Locktyukhin.
++#
++# Calculations inevitably require temporary reqisters, and there are
++# no %xmm registers left to spare. For this reason part of the ring
++# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring
++# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
++# X[-5], and X[4] - X[-4]...
++#
++# Another notable optimization is aggressive stack frame compression
++# aiming to minimize amount of 9-byte instructions...
++#
++# Yet another notable optimization is "jumping" $B variable. It means
++# that there is no register permanently allocated for $B value. This
++# allowed to eliminate one instruction from body_20_39...
++#
++my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
++my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
++my @V=($A,$B,$C,$D,$E);
++my $j=0;			# hash round
++my @T=($T,$tmp1);
++my $inp;
 +
-+	 eval(shift(@insns));
-+	  &paddd	(@Tx[1], at X[-1&7]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
++my $_rol=sub { &rol(@_) };
++my $_ror=sub { &ror(@_) };
 +
-+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer IALU
++&function_begin("_sha1_block_data_order_ssse3");
++	&call	(&label("pic_point"));	# make it PIC!
++	&set_label("pic_point");
++	&blindpop($tmp1);
++	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
++&set_label("ssse3_shortcut");
 +
-+	 foreach (@insns) { eval; }		# remaining instructions
++	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
++	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
++	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
++	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
++	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask
 +
-+	&cmp	($inp,$num);
-+	&je	(".Ldone_ssse3");
++	&mov	($E,&wparam(0));		# load argument block
++	&mov	($inp=@T[1],&wparam(1));
++	&mov	($D,&wparam(2));
++	&mov	(@T[0],"esp");
 +
-+	unshift(@Tx,pop(@Tx));
++	# stack frame layout
++	#
++	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
++	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
++	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
++	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
++	#
++	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
++	#	X[4]	X[5]	X[6]	X[7]
++	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
++	#
++	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
++	#	K_40_59	K_40_59	K_40_59	K_40_59
++	#	K_60_79	K_60_79	K_60_79	K_60_79
++	#	K_00_19	K_00_19	K_00_19	K_00_19
++	#	pbswap mask
++	#
++	# +192	ctx				# argument block
++	# +196	inp
++	# +200	end
++	# +204	esp
++	&sub	("esp",208);
++	&and	("esp",-64);
 +
-+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
-+	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
-+	&movdqu	(@X[-4&7],"0($inp)");		# load input
-+	&movdqu	(@X[-3&7],"16($inp)");
-+	&movdqu	(@X[-2&7],"32($inp)");
-+	&movdqu	(@X[-1&7],"48($inp)");
-+	&pshufb	(@X[-4&7], at X[2]);		# byte swap
++	&movdqa	(&QWP(112+0,"esp"), at X[4]);	# copy constants
++	&movdqa	(&QWP(112+16,"esp"), at X[5]);
++	&movdqa	(&QWP(112+32,"esp"), at X[6]);
++	&shl	($D,6);				# len*64
++	&movdqa	(&QWP(112+48,"esp"), at X[3]);
++	&add	($D,$inp);			# end of input
++	&movdqa	(&QWP(112+64,"esp"), at X[2]);
 +	&add	($inp,64);
++	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
++	&mov	(&DWP(192+4,"esp"),$inp);
++	&mov	(&DWP(192+8,"esp"),$D);
++	&mov	(&DWP(192+12,"esp"), at T[0]);	# save original %esp
 +
-+  $Xi=0;
-+}
++	&mov	($A,&DWP(0,$E));		# load context
++	&mov	($B,&DWP(4,$E));
++	&mov	($C,&DWP(8,$E));
++	&mov	($D,&DWP(12,$E));
++	&mov	($E,&DWP(16,$E));
++	&mov	(@T[0],$B);			# magic seed
 +
-+sub Xloop_ssse3()
++	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
++	&movdqu	(@X[-3&7],&QWP(-48,$inp));
++	&movdqu	(@X[-2&7],&QWP(-32,$inp));
++	&movdqu	(@X[-1&7],&QWP(-16,$inp));
++	&pshufb	(@X[-4&7],@X[2]);		# byte swap
++	&pshufb	(@X[-3&7],@X[2]);
++	&pshufb	(@X[-2&7],@X[2]);
++	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
++	&pshufb	(@X[-1&7],@X[2]);
++	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
++	&paddd	(@X[-3&7],@X[3]);
++	&paddd	(@X[-2&7],@X[3]);
++	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
++	&psubd	(@X[-4&7],@X[3]);		# restore X[]
++	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
++	&psubd	(@X[-3&7],@X[3]);
++	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
++	&psubd	(@X[-2&7],@X[3]);
++	&movdqa	(@X[0],@X[-3&7]);
++	&jmp	(&label("loop"));
++
++######################################################################
++# SSE instruction sequence is first broken to groups of indepentent
++# instructions, independent in respect to their inputs and shifter
++# (not all architectures have more than one). Then IALU instructions
++# are "knitted in" between the SSE groups. Distance is maintained for
++# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
++# [which allegedly also implements SSSE3]...
++#
++# Temporary registers usage. X[2] is volatile at the entry and at the
++# end is restored from backtrace ring buffer. X[3] is expected to
++# contain current K_XX_XX constant and is used to caclulate X[-1]+K
++# from previous round, it becomes volatile the moment the value is
++# saved to stack for transfer to IALU. X[4] becomes volatile whenever
++# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
++# end it is loaded with next K_XX_XX [which becomes X[3] in next
++# round]...
++#
++sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
 +{ use integer;
 +  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
 +  my ($a,$b,$c,$d,$e);
 +
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&pshufb	(@X[($Xi-3)&7], at X[2]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&paddd	(@X[($Xi-4)&7], at Tx[1]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&movdqa	(eval(16*$Xi)."(%rsp)", at X[($Xi-4)&7]);	# X[]+K xfer to IALU
++	&palignr(@X[0], at X[-4&7],8);	# compose "X[-14]" in "X[0]"
++	&movdqa	(@X[2], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&psubd	(@X[($Xi-4)&7], at Tx[1]);
-+
-+	foreach (@insns) { eval; }
-+  $Xi++;
-+}
-+
-+sub Xtail_ssse3()
-+{ use integer;
-+  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
-+  my ($a,$b,$c,$d,$e);
-+
-+	foreach (@insns) { eval; }
-+}
-+
-+sub body_00_19 () {
-+	(
-+	'($a,$b,$c,$d,$e)=@V;'.
-+	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
-+	'&xor	($c,$d);',
-+	'&mov	(@T[1],$a);',	# $b in next round
-+	'&$_rol	($a,5);',
-+	'&and	(@T[0],$c);',	# ($b&($c^$d))
-+	'&xor	($c,$d);',	# restore $c
-+	'&xor	(@T[0],$d);',
-+	'&add	($e,$a);',
-+	'&$_ror	($b,$j?7:2);',	# $b>>>2
-+	'&add	($e, at T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
-+	);
-+}
-+
-+sub body_20_39 () {
-+	(
-+	'($a,$b,$c,$d,$e)=@V;'.
-+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
-+	'&xor	(@T[0],$d);',	# ($b^$d)
-+	'&mov	(@T[1],$a);',	# $b in next round
-+	'&$_rol	($a,5);',
-+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
-+	'&add	($e,$a);',
-+	'&$_ror	($b,7);',	# $b>>>2
-+	'&add	($e, at T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
-+	);
-+}
-+
-+sub body_40_59 () {
-+	(
-+	'($a,$b,$c,$d,$e)=@V;'.
-+	'&mov	(@T[1],$c);',
-+	'&xor	($c,$d);',
-+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
-+	'&and	(@T[1],$d);',
-+	'&and	(@T[0],$c);',	# ($b&($c^$d))
-+	'&$_ror	($b,7);',	# $b>>>2
-+	'&add	($e, at T[1]);',
-+	'&mov	(@T[1],$a);',	# $b in next round
-+	'&$_rol	($a,5);',
-+	'&add	($e, at T[0]);',
-+	'&xor	($c,$d);',	# restore $c
-+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
-+	);
-+}
-+$code.=<<___;
-+.align	16
-+.Loop_ssse3:
-+___
-+	&Xupdate_ssse3_16_31(\&body_00_19);
-+	&Xupdate_ssse3_16_31(\&body_00_19);
-+	&Xupdate_ssse3_16_31(\&body_00_19);
-+	&Xupdate_ssse3_16_31(\&body_00_19);
-+	&Xupdate_ssse3_32_79(\&body_00_19);
-+	&Xupdate_ssse3_32_79(\&body_20_39);
-+	&Xupdate_ssse3_32_79(\&body_20_39);
-+	&Xupdate_ssse3_32_79(\&body_20_39);
-+	&Xupdate_ssse3_32_79(\&body_20_39);
-+	&Xupdate_ssse3_32_79(\&body_20_39);
-+	&Xupdate_ssse3_32_79(\&body_40_59);
-+	&Xupdate_ssse3_32_79(\&body_40_59);
-+	&Xupdate_ssse3_32_79(\&body_40_59);
-+	&Xupdate_ssse3_32_79(\&body_40_59);
-+	&Xupdate_ssse3_32_79(\&body_40_59);
-+	&Xupdate_ssse3_32_79(\&body_20_39);
-+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
-+
-+				$saved_j=$j; @saved_V=@V;
 +
-+	&Xloop_ssse3(\&body_20_39);
-+	&Xloop_ssse3(\&body_20_39);
-+	&Xloop_ssse3(\&body_20_39);
++	  &paddd	(@X[3],@X[-1&7]);
++	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	&pxor	(@X[0], at X[-4&7]);	# "X[0]"^="X[-16]"
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+$code.=<<___;
-+	add	0($ctx),$A			# update context
-+	add	4($ctx), at T[0]
-+	add	8($ctx),$C
-+	add	12($ctx),$D
-+	mov	$A,0($ctx)
-+	add	16($ctx),$E
-+	mov	@T[0],4($ctx)
-+	mov	@T[0],$B			# magic seed
-+	mov	$C,8($ctx)
-+	mov	$D,12($ctx)
-+	mov	$E,16($ctx)
-+	jmp	.Loop_ssse3
++	&pxor	(@X[2], at X[-2&7]);	# "X[-3]"^"X[-8]"
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+.align	16
-+.Ldone_ssse3:
-+___
-+				$j=$saved_j; @V=@saved_V;
++	&pxor	(@X[0], at X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+	&Xtail_ssse3(\&body_20_39);
-+	&Xtail_ssse3(\&body_20_39);
-+	&Xtail_ssse3(\&body_20_39);
++	&movdqa	(@X[4], at X[0]);
++	&movdqa	(@X[2], at X[0]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+$code.=<<___;
-+	add	0($ctx),$A			# update context
-+	add	4($ctx), at T[0]
-+	add	8($ctx),$C
-+	mov	$A,0($ctx)
-+	add	12($ctx),$D
-+	mov	@T[0],4($ctx)
-+	add	16($ctx),$E
-+	mov	$C,8($ctx)
-+	mov	$D,12($ctx)
-+	mov	$E,16($ctx)
-+___
-+$code.=<<___ if ($win64);
-+	movaps	64+0(%rsp),%xmm6
-+	movaps	64+16(%rsp),%xmm7
-+	movaps	64+32(%rsp),%xmm8
-+	movaps	64+48(%rsp),%xmm9
-+	movaps	64+64(%rsp),%xmm10
-+___
-+$code.=<<___;
-+	lea	`64+($win64?6*16:0)`(%rsp),%rsi
-+	mov	0(%rsi),%r12
-+	mov	8(%rsi),%rbp
-+	mov	16(%rsi),%rbx
-+	lea	24(%rsi),%rsp
-+.Lepilogue_ssse3:
-+	ret
-+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
-+___
++	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
++	&paddd	(@X[0], at X[0]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+if ($avx) {
-+my $Xi=4;
-+my @X=map("%xmm$_",(4..7,0..3));
-+my @Tx=map("%xmm$_",(8..10));
-+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
-+my @T=("%esi","%edi");
-+my $j=0;
-+my $K_XX_XX="%r11";
++	&psrld	(@X[2],31);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	&movdqa	(@X[3], at X[4]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+my $_rol=sub { &shld(@_[0],@_) };
-+my $_ror=sub { &shrd(@_[0],@_) };
++	&psrld	(@X[4],30);
++	&por	(@X[0], at X[2]);		# "X[0]"<<<=1
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+$code.=<<___;
-+.type	sha1_block_data_order_avx,\@function,3
-+.align	16
-+sha1_block_data_order_avx:
-+_avx_shortcut:
-+	push	%rbx
-+	push	%rbp
-+	push	%r12
-+	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
-+___
-+$code.=<<___ if ($win64);
-+	movaps	%xmm6,64+0(%rsp)
-+	movaps	%xmm7,64+16(%rsp)
-+	movaps	%xmm8,64+32(%rsp)
-+	movaps	%xmm9,64+48(%rsp)
-+	movaps	%xmm10,64+64(%rsp)
-+.Lprologue_avx:
-+___
-+$code.=<<___;
-+	mov	%rdi,$ctx	# reassigned argument
-+	mov	%rsi,$inp	# reassigned argument
-+	mov	%rdx,$num	# reassigned argument
-+	vzeroall
++	&pslld	(@X[3],2);
++	&pxor	(@X[0], at X[4]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+	shl	\$6,$num
-+	add	$inp,$num
-+	lea	K_XX_XX(%rip),$K_XX_XX
++	&pxor	(@X[0], at X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
++	  &movdqa	(@X[1], at X[-2&7])	if ($Xi<7);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+	mov	0($ctx),$A		# load context
-+	mov	4($ctx),$B
-+	mov	8($ctx),$C
-+	mov	12($ctx),$D
-+	mov	$B,@T[0]		# magic seed
-+	mov	16($ctx),$E
++	 foreach (@insns) { eval; }	# remaining instructions [if any]
 +
-+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
-+	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
-+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
-+	vmovdqu	16($inp),@X[-3&7]
-+	vmovdqu	32($inp),@X[-2&7]
-+	vmovdqu	48($inp),@X[-1&7]
-+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
-+	add	\$64,$inp
-+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
-+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
-+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
-+	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
-+	vpaddd	@Tx[1],@X[-3&7],@X[1]
-+	vpaddd	@Tx[1],@X[-2&7],@X[2]
-+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
-+	vmovdqa	@X[1],16(%rsp)
-+	vmovdqa	@X[2],32(%rsp)
-+	jmp	.Loop_avx
-+___
++  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++}
 +
-+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
++sub Xupdate_ssse3_32_79()
 +{ use integer;
 +  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
 +  my ($a,$b,$c,$d,$e);
 +
++	&movdqa	(@X[2], at X[-1&7])	if ($Xi==8);
++	 eval(shift(@insns));		# body_20_39
++	&pxor	(@X[0], at X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
++	&palignr(@X[2], at X[-2&7],8);	# compose "X[-6]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpalignr(@X[0], at X[-3&7], at X[-4&7],8);	# compose "X[-14]" in "X[0]"
++	 eval(shift(@insns));		# rol
++
++	&pxor	(@X[0], at X[-7&7]);	# "X[0]"^="X[-28]"
++	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);	# save X[] to backtrace buffer
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	 if ($Xi%5) {
++	  &movdqa	(@X[4], at X[3]);	# "perpetuate" K_XX_XX...
++	 } else {			# ... or load next one
++	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
++	 }
++	  &paddd	(@X[3], at X[-1&7]);
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
 +
-+	  &vpaddd	(@Tx[1], at Tx[1], at X[-1&7]);
++	&pxor	(@X[0], at X[2]);		# "X[0]"^="X[-6]"
++	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpsrldq(@Tx[0], at X[-1&7],4);	# "X[-3]", 3 dwords
++	 eval(shift(@insns));		# rol
++
++	&movdqa	(@X[2], at X[0]);
++	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpxor	(@X[0], at X[0], at X[-4&7]);		# "X[0]"^="X[-16]"
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
++
++	&pslld	(@X[0],2);
++	 eval(shift(@insns));		# body_20_39
++	 eval(shift(@insns));
++	&psrld	(@X[2],30);
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
 +
-+	&vpxor	(@Tx[0], at Tx[0], at X[-2&7]);	# "X[-3]"^"X[-8]"
++	&por	(@X[0], at X[2]);		# "X[0]"<<<=2
++	 eval(shift(@insns));		# body_20_39
++	 eval(shift(@insns));
++	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# rol
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# ror
++	  &movdqa	(@X[3], at X[0])	if ($Xi<19);
++	 eval(shift(@insns));
++
++	 foreach (@insns) { eval; }	# remaining instructions
++
++  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++}
++
++sub Xuplast_ssse3_80()
++{ use integer;
++  my $body = shift;
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my ($a,$b,$c,$d,$e);
++
++	 eval(shift(@insns));
++	  &paddd	(@X[3], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpxor	(@X[0], at X[0], at Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
++	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer IALU
++
++	 foreach (@insns) { eval; }		# remaining instructions
++
++	&mov	($inp=@T[1],&DWP(192+4,"esp"));
++	&cmp	($inp,&DWP(192+8,"esp"));
++	&je	(&label("done"));
++
++	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
++	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
++	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
++	&movdqu	(@X[-3&7],&QWP(16,$inp));
++	&movdqu	(@X[-2&7],&QWP(32,$inp));
++	&movdqu	(@X[-1&7],&QWP(48,$inp));
++	&add	($inp,64);
++	&pshufb	(@X[-4&7],@X[2]);		# byte swap
++	&mov	(&DWP(192+4,"esp"),$inp);
++	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
++
++  $Xi=0;
++}
++
++sub Xloop_ssse3()
++{ use integer;
++  my $body = shift;
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my ($a,$b,$c,$d,$e);
 +
-+	&vpsrld	(@Tx[0], at X[0],31);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+
-+	&vpslldq(@Tx[2], at X[0],12);		# "X[0]"<<96, extract one dword
-+	&vpaddd	(@X[0], at X[0], at X[0]);
++	&pshufb	(@X[($Xi-3)&7], at X[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	&paddd	(@X[($Xi-4)&7], at X[3]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+
-+	&vpsrld	(@Tx[1], at Tx[2],30);
-+	&vpor	(@X[0], at X[0], at Tx[0]);		# "X[0]"<<<=1
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	&movdqa	(&QWP(0+16*$Xi,"esp"), at X[($Xi-4)&7]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	&psubd	(@X[($Xi-4)&7], at X[3]);
 +
-+	&vpslld	(@Tx[2], at Tx[2],2);
-+	&vpxor	(@X[0], at X[0], at Tx[1]);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
++	foreach (@insns) { eval; }
++  $Xi++;
++}
 +
-+	&vpxor	(@X[0], at X[0], at Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
++sub Xtail_ssse3()
++{ use integer;
++  my $body = shift;
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my ($a,$b,$c,$d,$e);
 +
++	foreach (@insns) { eval; }
++}
 +
-+	 foreach (@insns) { eval; }	# remaining instructions [if any]
++sub body_00_19 () {
++	(
++	'($a,$b,$c,$d,$e)=@V;'.
++	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
++	'&xor	($c,$d);',
++	'&mov	(@T[1],$a);',	# $b in next round
++	'&$_rol	($a,5);',
++	'&and	(@T[0],$c);',	# ($b&($c^$d))
++	'&xor	($c,$d);',	# restore $c
++	'&xor	(@T[0],$d);',
++	'&add	($e,$a);',
++	'&$_ror	($b,$j?7:2);',	# $b>>>2
++	'&add	($e, at T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
++	);
++}
 +
-+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
-+		push(@Tx,shift(@Tx));
++sub body_20_39 () {
++	(
++	'($a,$b,$c,$d,$e)=@V;'.
++	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
++	'&xor	(@T[0],$d);',	# ($b^$d)
++	'&mov	(@T[1],$a);',	# $b in next round
++	'&$_rol	($a,5);',
++	'&xor	(@T[0],$c);',	# ($b^$d^$c)
++	'&add	($e,$a);',
++	'&$_ror	($b,7);',	# $b>>>2
++	'&add	($e, at T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
++	);
 +}
 +
-+sub Xupdate_avx_32_79()
++sub body_40_59 () {
++	(
++	'($a,$b,$c,$d,$e)=@V;'.
++	'&mov	(@T[1],$c);',
++	'&xor	($c,$d);',
++	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
++	'&and	(@T[1],$d);',
++	'&and	(@T[0],$c);',	# ($b&($c^$d))
++	'&$_ror	($b,7);',	# $b>>>2
++	'&add	($e, at T[1]);',
++	'&mov	(@T[1],$a);',	# $b in next round
++	'&$_rol	($a,5);',
++	'&add	($e, at T[0]);',
++	'&xor	($c,$d);',	# restore $c
++	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
++	);
++}
++
++&set_label("loop",16);
++	&Xupdate_ssse3_16_31(\&body_00_19);
++	&Xupdate_ssse3_16_31(\&body_00_19);
++	&Xupdate_ssse3_16_31(\&body_00_19);
++	&Xupdate_ssse3_16_31(\&body_00_19);
++	&Xupdate_ssse3_32_79(\&body_00_19);
++	&Xupdate_ssse3_32_79(\&body_20_39);
++	&Xupdate_ssse3_32_79(\&body_20_39);
++	&Xupdate_ssse3_32_79(\&body_20_39);
++	&Xupdate_ssse3_32_79(\&body_20_39);
++	&Xupdate_ssse3_32_79(\&body_20_39);
++	&Xupdate_ssse3_32_79(\&body_40_59);
++	&Xupdate_ssse3_32_79(\&body_40_59);
++	&Xupdate_ssse3_32_79(\&body_40_59);
++	&Xupdate_ssse3_32_79(\&body_40_59);
++	&Xupdate_ssse3_32_79(\&body_40_59);
++	&Xupdate_ssse3_32_79(\&body_20_39);
++	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
++
++				$saved_j=$j; @saved_V=@V;
++
++	&Xloop_ssse3(\&body_20_39);
++	&Xloop_ssse3(\&body_20_39);
++	&Xloop_ssse3(\&body_20_39);
++
++	&mov	(@T[1],&DWP(192,"esp"));	# update context
++	&add	($A,&DWP(0,@T[1]));
++	&add	(@T[0],&DWP(4,@T[1]));		# $b
++	&add	($C,&DWP(8,@T[1]));
++	&mov	(&DWP(0,@T[1]),$A);
++	&add	($D,&DWP(12,@T[1]));
++	&mov	(&DWP(4,@T[1]),@T[0]);
++	&add	($E,&DWP(16,@T[1]));
++	&mov	(&DWP(8,@T[1]),$C);
++	&mov	($B,@T[0]);
++	&mov	(&DWP(12,@T[1]),$D);
++	&mov	(&DWP(16,@T[1]),$E);
++
++	&jmp	(&label("loop"));
++
++&set_label("done",16);		$j=$saved_j; @V=@saved_V;
++
++	&Xtail_ssse3(\&body_20_39);
++	&Xtail_ssse3(\&body_20_39);
++	&Xtail_ssse3(\&body_20_39);
++
++	&mov	(@T[1],&DWP(192,"esp"));	# update context
++	&add	($A,&DWP(0,@T[1]));
++	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
++	&add	(@T[0],&DWP(4,@T[1]));		# $b
++	&add	($C,&DWP(8,@T[1]));
++	&mov	(&DWP(0,@T[1]),$A);
++	&add	($D,&DWP(12,@T[1]));
++	&mov	(&DWP(4,@T[1]),@T[0]);
++	&add	($E,&DWP(16,@T[1]));
++	&mov	(&DWP(8,@T[1]),$C);
++	&mov	(&DWP(12,@T[1]),$D);
++	&mov	(&DWP(16,@T[1]),$E);
++
++&function_end("_sha1_block_data_order_ssse3");
++
++if ($ymm) {
++my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
++my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
++my @V=($A,$B,$C,$D,$E);
++my $j=0;			# hash round
++my @T=($T,$tmp1);
++my $inp;
++
++my $_rol=sub { &shld(@_[0],@_) };
++my $_ror=sub { &shrd(@_[0],@_) };
++
++&function_begin("_sha1_block_data_order_avx");
++	&call	(&label("pic_point"));	# make it PIC!
++	&set_label("pic_point");
++	&blindpop($tmp1);
++	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
++&set_label("avx_shortcut");
++	&vzeroall();
++
++	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
++	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
++	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
++	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
++	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask
++
++	&mov	($E,&wparam(0));		# load argument block
++	&mov	($inp=@T[1],&wparam(1));
++	&mov	($D,&wparam(2));
++	&mov	(@T[0],"esp");
++
++	# stack frame layout
++	#
++	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
++	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
++	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
++	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
++	#
++	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
++	#	X[4]	X[5]	X[6]	X[7]
++	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
++	#
++	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
++	#	K_40_59	K_40_59	K_40_59	K_40_59
++	#	K_60_79	K_60_79	K_60_79	K_60_79
++	#	K_00_19	K_00_19	K_00_19	K_00_19
++	#	pbswap mask
++	#
++	# +192	ctx				# argument block
++	# +196	inp
++	# +200	end
++	# +204	esp
++	&sub	("esp",208);
++	&and	("esp",-64);
++
++	&vmovdqa(&QWP(112+0,"esp"), at X[4]);	# copy constants
++	&vmovdqa(&QWP(112+16,"esp"), at X[5]);
++	&vmovdqa(&QWP(112+32,"esp"), at X[6]);
++	&shl	($D,6);				# len*64
++	&vmovdqa(&QWP(112+48,"esp"), at X[3]);
++	&add	($D,$inp);			# end of input
++	&vmovdqa(&QWP(112+64,"esp"), at X[2]);
++	&add	($inp,64);
++	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
++	&mov	(&DWP(192+4,"esp"),$inp);
++	&mov	(&DWP(192+8,"esp"),$D);
++	&mov	(&DWP(192+12,"esp"), at T[0]);	# save original %esp
++
++	&mov	($A,&DWP(0,$E));		# load context
++	&mov	($B,&DWP(4,$E));
++	&mov	($C,&DWP(8,$E));
++	&mov	($D,&DWP(12,$E));
++	&mov	($E,&DWP(16,$E));
++	&mov	(@T[0],$B);			# magic seed
++
++	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
++	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
++	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
++	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
++	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
++	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
++	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
++	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
++	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
++	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
++	&vpaddd	(@X[1],@X[-3&7],@X[3]);
++	&vpaddd	(@X[2],@X[-2&7],@X[3]);
++	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
++	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
++	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
++	&jmp	(&label("loop"));
++
++sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
 +{ use integer;
 +  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
++  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
 +  my ($a,$b,$c,$d,$e);
 +
-+	&vpalignr(@Tx[0], at X[-1&7], at X[-2&7],8);	# compose "X[-6]"
-+	&vpxor	(@X[0], at X[0], at X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
-+	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
-+
-+	&vpxor	(@X[0], at X[0], at X[-7&7]);		# "X[0]"^="X[-28]"
++	&vpalignr(@X[0], at X[-3&7], at X[-4&7],8);	# compose "X[-14]" in "X[0]"
 +	 eval(shift(@insns));
-+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
-+	if ($Xi%5) {
-+	  &vmovdqa	(@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
-+	} else {			# ... or load next one
-+	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
-+	}
-+	  &vpaddd	(@Tx[1], at Tx[1], at X[-1&7]);
-+	 eval(shift(@insns));		# ror
 +	 eval(shift(@insns));
 +
-+	&vpxor	(@X[0], at X[0], at Tx[0]);		# "X[0]"^="X[-6]"
-+	 eval(shift(@insns));		# body_20_39
++	  &vpaddd	(@X[3], at X[3], at X[-1&7]);
++	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
-+
-+	&vpsrld	(@Tx[0], at X[0],30);
-+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
++	&vpsrldq(@X[2], at X[-1&7],4);		# "X[-3]", 3 dwords
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# ror
++	&vpxor	(@X[0], at X[0], at X[-4&7]);		# "X[0]"^="X[-16]"
 +	 eval(shift(@insns));
-+
-+	&vpslld	(@X[0], at X[0],2);
-+	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
++
++	&vpxor	(@X[2], at X[2], at X[-2&7]);		# "X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
++	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# ror
 +	 eval(shift(@insns));
 +
-+	&vpor	(@X[0], at X[0], at Tx[0]);		# "X[0]"<<<=2
-+	 eval(shift(@insns));		# body_20_39
-+	 eval(shift(@insns));
-+	  &vmovdqa	(@Tx[1], at X[0])	if ($Xi<19);
++	&vpxor	(@X[0], at X[0], at X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +
-+	 foreach (@insns) { eval; }	# remaining instructions
-+
-+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
-+		push(@Tx,shift(@Tx));
-+}
-+
-+sub Xuplast_avx_80()
-+{ use integer;
-+  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
-+  my ($a,$b,$c,$d,$e);
-+
-+	 eval(shift(@insns));
-+	  &vpaddd	(@Tx[1], at Tx[1], at X[-1&7]);
++	&vpsrld	(@X[2], at X[0],31);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer IALU
-+
-+	 foreach (@insns) { eval; }		# remaining instructions
-+
-+	&cmp	($inp,$num);
-+	&je	(".Ldone_avx");
-+
-+	unshift(@Tx,pop(@Tx));
-+
-+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
-+	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
-+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
-+	&vmovdqu(@X[-3&7],"16($inp)");
-+	&vmovdqu(@X[-2&7],"32($inp)");
-+	&vmovdqu(@X[-1&7],"48($inp)");
-+	&vpshufb(@X[-4&7], at X[-4&7], at X[2]);	# byte swap
-+	&add	($inp,64);
-+
-+  $Xi=0;
-+}
++	&vpslldq(@X[4], at X[0],12);		# "X[0]"<<96, extract one dword
++	&vpaddd	(@X[0], at X[0], at X[0]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+sub Xloop_avx()
-+{ use integer;
-+  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
-+  my ($a,$b,$c,$d,$e);
++	&vpsrld	(@X[3], at X[4],30);
++	&vpor	(@X[0], at X[0], at X[2]);		# "X[0]"<<<=1
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
++	&vpslld	(@X[4], at X[4],2);
++	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpshufb(@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
++	&vpxor	(@X[0], at X[0], at X[3]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpaddd	(@X[$Xi&7], at X[($Xi-4)&7], at Tx[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++
++	&vpxor	(@X[0], at X[0], at X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vmovdqa(eval(16*$Xi)."(%rsp)", at X[$Xi&7]);	# X[]+K xfer to IALU
++	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	foreach (@insns) { eval; }
-+  $Xi++;
++	 foreach (@insns) { eval; }	# remaining instructions [if any]
++
++  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
 +}
 +
-+sub Xtail_avx()
++sub Xupdate_avx_32_79()
 +{ use integer;
 +  my $body = shift;
-+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
 +  my ($a,$b,$c,$d,$e);
 +
-+	foreach (@insns) { eval; }
-+}
-+
-+$code.=<<___;
-+.align	16
-+.Loop_avx:
-+___
-+	&Xupdate_avx_16_31(\&body_00_19);
-+	&Xupdate_avx_16_31(\&body_00_19);
-+	&Xupdate_avx_16_31(\&body_00_19);
-+	&Xupdate_avx_16_31(\&body_00_19);
-+	&Xupdate_avx_32_79(\&body_00_19);
-+	&Xupdate_avx_32_79(\&body_20_39);
-+	&Xupdate_avx_32_79(\&body_20_39);
-+	&Xupdate_avx_32_79(\&body_20_39);
-+	&Xupdate_avx_32_79(\&body_20_39);
-+	&Xupdate_avx_32_79(\&body_20_39);
-+	&Xupdate_avx_32_79(\&body_40_59);
-+	&Xupdate_avx_32_79(\&body_40_59);
-+	&Xupdate_avx_32_79(\&body_40_59);
-+	&Xupdate_avx_32_79(\&body_40_59);
-+	&Xupdate_avx_32_79(\&body_40_59);
-+	&Xupdate_avx_32_79(\&body_20_39);
-+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
-+
-+				$saved_j=$j; @saved_V=@V;
-+
-+	&Xloop_avx(\&body_20_39);
-+	&Xloop_avx(\&body_20_39);
-+	&Xloop_avx(\&body_20_39);
-+
-+$code.=<<___;
-+	add	0($ctx),$A			# update context
-+	add	4($ctx), at T[0]
-+	add	8($ctx),$C
-+	add	12($ctx),$D
-+	mov	$A,0($ctx)
-+	add	16($ctx),$E
-+	mov	@T[0],4($ctx)
-+	mov	@T[0],$B			# magic seed
-+	mov	$C,8($ctx)
-+	mov	$D,12($ctx)
-+	mov	$E,16($ctx)
-+	jmp	.Loop_avx
-+
-+.align	16
-+.Ldone_avx:
-+___
-+				$j=$saved_j; @V=@saved_V;
++	&vpalignr(@X[2], at X[-1&7], at X[-2&7],8);	# compose "X[-6]"
++	&vpxor	(@X[0], at X[0], at X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
++	 eval(shift(@insns));		# body_20_39
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# rol
 +
-+	&Xtail_avx(\&body_20_39);
-+	&Xtail_avx(\&body_20_39);
-+	&Xtail_avx(\&body_20_39);
++	&vpxor	(@X[0], at X[0], at X[-7&7]);	# "X[0]"^="X[-28]"
++	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);	# save X[] to backtrace buffer
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 if ($Xi%5) {
++	  &vmovdqa	(@X[4], at X[3]);	# "perpetuate" K_XX_XX...
++	 } else {			# ... or load next one
++	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
++	 }
++	  &vpaddd	(@X[3], at X[3], at X[-1&7]);
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
 +
-+$code.=<<___;
-+	vzeroall
++	&vpxor	(@X[0], at X[0], at X[2]);		# "X[0]"^="X[-6]"
++	 eval(shift(@insns));		# body_20_39
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# rol
 +
-+	add	0($ctx),$A			# update context
-+	add	4($ctx), at T[0]
-+	add	8($ctx),$C
-+	mov	$A,0($ctx)
-+	add	12($ctx),$D
-+	mov	@T[0],4($ctx)
-+	add	16($ctx),$E
-+	mov	$C,8($ctx)
-+	mov	$D,12($ctx)
-+	mov	$E,16($ctx)
-+___
-+$code.=<<___ if ($win64);
-+	movaps	64+0(%rsp),%xmm6
-+	movaps	64+16(%rsp),%xmm7
-+	movaps	64+32(%rsp),%xmm8
-+	movaps	64+48(%rsp),%xmm9
-+	movaps	64+64(%rsp),%xmm10
-+___
-+$code.=<<___;
-+	lea	`64+($win64?6*16:0)`(%rsp),%rsi
-+	mov	0(%rsi),%r12
-+	mov	8(%rsi),%rbp
-+	mov	16(%rsi),%rbx
-+	lea	24(%rsi),%rsp
-+.Lepilogue_avx:
-+	ret
-+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
-+___
-+}
-+$code.=<<___;
-+.align	64
-+K_XX_XX:
-+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
-+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
-+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
-+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
-+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
-+___
-+}}}
-+$code.=<<___;
-+.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-+.align	64
- ___
- 
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-@@ -272,25 +1109,73 @@ se_handler:
- 
- 	lea	.Lprologue(%rip),%r10
- 	cmp	%r10,%rbx		# context->Rip<.Lprologue
--	jb	.Lin_prologue
-+	jb	.Lcommon_seh_tail
- 
- 	mov	152($context),%rax	# pull context->Rsp
- 
- 	lea	.Lepilogue(%rip),%r10
- 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
--	jae	.Lin_prologue
-+	jae	.Lcommon_seh_tail
- 
- 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
--	lea	24(%rax),%rax
-+	lea	32(%rax),%rax
- 
- 	mov	-8(%rax),%rbx
- 	mov	-16(%rax),%rbp
- 	mov	-24(%rax),%r12
-+	mov	-32(%rax),%r13
- 	mov	%rbx,144($context)	# restore context->Rbx
- 	mov	%rbp,160($context)	# restore context->Rbp
- 	mov	%r12,216($context)	# restore context->R12
-+	mov	%r13,224($context)	# restore context->R13
++	&vpsrld	(@X[2], at X[0],30);
++	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
 +
-+	jmp	.Lcommon_seh_tail
-+.size	se_handler,.-se_handler
++	&vpslld	(@X[0], at X[0],2);
++	 eval(shift(@insns));		# body_20_39
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# rol
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
 +
-+.type	ssse3_handler,\@abi-omnipotent
-+.align	16
-+ssse3_handler:
-+	push	%rsi
-+	push	%rdi
-+	push	%rbx
-+	push	%rbp
-+	push	%r12
-+	push	%r13
-+	push	%r14
-+	push	%r15
-+	pushfq
-+	sub	\$64,%rsp
++	&vpor	(@X[0], at X[0], at X[2]);	# "X[0]"<<<=2
++	 eval(shift(@insns));		# body_20_39
++	 eval(shift(@insns));
++	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# rol
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));
 +
-+	mov	120($context),%rax	# pull context->Rax
-+	mov	248($context),%rbx	# pull context->Rip
++	 foreach (@insns) { eval; }	# remaining instructions
 +
-+	mov	8($disp),%rsi		# disp->ImageBase
-+	mov	56($disp),%r11		# disp->HandlerData
++  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++}
 +
-+	mov	0(%r11),%r10d		# HandlerData[0]
-+	lea	(%rsi,%r10),%r10	# prologue label
-+	cmp	%r10,%rbx		# context->Rip<prologue label
-+	jb	.Lcommon_seh_tail
++sub Xuplast_avx_80()
++{ use integer;
++  my $body = shift;
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my ($a,$b,$c,$d,$e);
 +
-+	mov	152($context),%rax	# pull context->Rsp
- 
--.Lin_prologue:
-+	mov	4(%r11),%r10d		# HandlerData[1]
-+	lea	(%rsi,%r10),%r10	# epilogue label
-+	cmp	%r10,%rbx		# context->Rip>=epilogue label
-+	jae	.Lcommon_seh_tail
++	 eval(shift(@insns));
++	  &vpaddd	(@X[3], at X[3], at X[-1&7]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
 +
-+	lea	64(%rax),%rsi
-+	lea	512($context),%rdi	# &context.Xmm6
-+	mov	\$10,%ecx
-+	.long	0xa548f3fc		# cld; rep movsq
-+	lea	24+5*16(%rax),%rax	# adjust stack pointer
++	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer IALU
 +
-+	mov	-8(%rax),%rbx
-+	mov	-16(%rax),%rbp
-+	mov	%rbx,144($context)	# restore context->Rbx
-+	mov	%rbp,160($context)	# restore context->Rbp
++	 foreach (@insns) { eval; }		# remaining instructions
 +
-+.Lcommon_seh_tail:
- 	mov	8(%rax),%rdi
- 	mov	16(%rax),%rsi
- 	mov	%rax,152($context)	# restore context->Rsp
-@@ -328,19 +1213,38 @@ se_handler:
- 	pop	%rdi
- 	pop	%rsi
- 	ret
--.size	se_handler,.-se_handler
-+.size	ssse3_handler,.-ssse3_handler
- 
- .section	.pdata
- .align	4
- 	.rva	.LSEH_begin_sha1_block_data_order
- 	.rva	.LSEH_end_sha1_block_data_order
- 	.rva	.LSEH_info_sha1_block_data_order
--
-+	.rva	.LSEH_begin_sha1_block_data_order_ssse3
-+	.rva	.LSEH_end_sha1_block_data_order_ssse3
-+	.rva	.LSEH_info_sha1_block_data_order_ssse3
-+___
-+$code.=<<___ if ($avx);
-+	.rva	.LSEH_begin_sha1_block_data_order_avx
-+	.rva	.LSEH_end_sha1_block_data_order_avx
-+	.rva	.LSEH_info_sha1_block_data_order_avx
-+___
-+$code.=<<___;
- .section	.xdata
- .align	8
- .LSEH_info_sha1_block_data_order:
- 	.byte	9,0,0,0
- 	.rva	se_handler
-+.LSEH_info_sha1_block_data_order_ssse3:
-+	.byte	9,0,0,0
-+	.rva	ssse3_handler
-+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
-+___
-+$code.=<<___ if ($avx);
-+.LSEH_info_sha1_block_data_order_avx:
-+	.byte	9,0,0,0
-+	.rva	ssse3_handler
-+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
- ___
- }
- 
-diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/crypto/sha/asm/sha1-586.pl
---- openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts	2008-07-17 11:50:56.000000000 +0200
-+++ openssl-1.0.0d/crypto/sha/asm/sha1-586.pl	2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
- 
- # ====================================================================
- # [Re]written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -12,6 +12,8 @@
- # commentary below], and in 2006 the rest was rewritten in order to
- # gain freedom to liberate licensing terms.
- 
-+# January, September 2004.
-+#
- # It was noted that Intel IA-32 C compiler generates code which
- # performs ~30% *faster* on P4 CPU than original *hand-coded*
- # SHA1 assembler implementation. To address this problem (and
-@@ -31,12 +33,92 @@
- # ----------------------------------------------------------------
- #					<appro at fy.chalmers.se>
- 
-+# August 2009.
-+#
-+# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
-+# '(c&d) + (b&(c^d))', which allows to accumulate partial results
-+# and lighten "pressure" on scratch registers. This resulted in
-+# >12% performance improvement on contemporary AMD cores (with no
-+# degradation on other CPUs:-). Also, the code was revised to maximize
-+# "distance" between instructions producing input to 'lea' instruction
-+# and the 'lea' instruction itself, which is essential for Intel Atom
-+# core and resulted in ~15% improvement.
++	&mov	($inp=@T[1],&DWP(192+4,"esp"));
++	&cmp	($inp,&DWP(192+8,"esp"));
++	&je	(&label("done"));
 +
-+# October 2010.
-+#
-+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
-+# is to offload message schedule denoted by Wt in NIST specification,
-+# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
-+# and in SSE2 context was first explored by Dean Gaudet in 2004, see
-+# http://arctic.org/~dean/crypto/sha1.html. Since then several things
-+# have changed that made it interesting again:
-+#
-+# a) XMM units became faster and wider;
-+# b) instruction set became more versatile;
-+# c) an important observation was made by Max Locktykhin, which made
-+#    it possible to reduce amount of instructions required to perform
-+#    the operation in question, for further details see
-+#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
++	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
++	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
++	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
++	&vmovdqu(@X[-3&7],&QWP(16,$inp));
++	&vmovdqu(@X[-2&7],&QWP(32,$inp));
++	&vmovdqu(@X[-1&7],&QWP(48,$inp));
++	&add	($inp,64);
++	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);		# byte swap
++	&mov	(&DWP(192+4,"esp"),$inp);
++	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
 +
-+# April 2011.
-+#
-+# Add AVX code path, probably most controversial... The thing is that
-+# switch to AVX alone improves performance by as little as 4% in
-+# comparison to SSSE3 code path. But below result doesn't look like
-+# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
-+# pair of µ-ops, and it's the additional µ-ops, two per round, that
-+# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
-+# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
-+# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
-+# cycles per processed byte. But 'sh[rl]d' is not something that used
-+# to be fast, nor does it appear to be fast in upcoming Bulldozer
-+# [according to its optimization manual]. Which is why AVX code path
-+# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
-+# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
-+# makes no sense to keep the AVX code path. If somebody feels that
-+# strongly, it's probably more appropriate to discuss possibility of
-+# using vector rotate XOP on AMD...
++  $Xi=0;
++}
 +
-+######################################################################
-+# Current performance is summarized in following table. Numbers are
-+# CPU clock cycles spent to process single byte (less is better).
-+#
-+#		x86		SSSE3		AVX
-+# Pentium	15.7		-
-+# PIII		11.5		-
-+# P4		10.6		-
-+# AMD K8	7.1		-
-+# Core2		7.3		6.1/+20%	-
-+# Atom		12.5		9.5(*)/+32%	-
-+# Westmere	7.3		5.6/+30%	-
-+# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
-+#
-+# (*)	Loop is 1056 instructions long and expected result is ~8.25.
-+#	It remains mystery [to me] why ILP is limited to 1.7.
-+#
-+# (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
++sub Xloop_avx()
++{ use integer;
++  my $body = shift;
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my ($a,$b,$c,$d,$e);
++
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	&vpshufb	(@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	&vpaddd	(@X[$Xi&7], at X[($Xi-4)&7], at X[3]);
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++	&vmovdqa	(&QWP(0+16*$Xi,"esp"), at X[$Xi&7]);	# X[]+K xfer to IALU
++	 eval(shift(@insns));
++	 eval(shift(@insns));
++
++	foreach (@insns) { eval; }
++  $Xi++;
++}
++
++sub Xtail_avx()
++{ use integer;
++  my $body = shift;
++  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
++  my ($a,$b,$c,$d,$e);
++
++	foreach (@insns) { eval; }
++}
++
++&set_label("loop",16);
++	&Xupdate_avx_16_31(\&body_00_19);
++	&Xupdate_avx_16_31(\&body_00_19);
++	&Xupdate_avx_16_31(\&body_00_19);
++	&Xupdate_avx_16_31(\&body_00_19);
++	&Xupdate_avx_32_79(\&body_00_19);
++	&Xupdate_avx_32_79(\&body_20_39);
++	&Xupdate_avx_32_79(\&body_20_39);
++	&Xupdate_avx_32_79(\&body_20_39);
++	&Xupdate_avx_32_79(\&body_20_39);
++	&Xupdate_avx_32_79(\&body_20_39);
++	&Xupdate_avx_32_79(\&body_40_59);
++	&Xupdate_avx_32_79(\&body_40_59);
++	&Xupdate_avx_32_79(\&body_40_59);
++	&Xupdate_avx_32_79(\&body_40_59);
++	&Xupdate_avx_32_79(\&body_40_59);
++	&Xupdate_avx_32_79(\&body_20_39);
++	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
 +
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- push(@INC,"${dir}","${dir}../../perlasm");
- require "x86asm.pl";
- 
- &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
- 
-+$xmm=1; $ymm=0;
-+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
++				$saved_j=$j; @saved_V=@V;
 +
-+$ymm=1 if ($xmm &&
-+		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
-+		$1>=2.19);	# first version supporting AVX
++	&Xloop_avx(\&body_20_39);
++	&Xloop_avx(\&body_20_39);
++	&Xloop_avx(\&body_20_39);
 +
-+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && 
-+		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
-+		$1>=2.03);	# first version supporting AVX
++	&mov	(@T[1],&DWP(192,"esp"));	# update context
++	&add	($A,&DWP(0,@T[1]));
++	&add	(@T[0],&DWP(4,@T[1]));		# $b
++	&add	($C,&DWP(8,@T[1]));
++	&mov	(&DWP(0,@T[1]),$A);
++	&add	($D,&DWP(12,@T[1]));
++	&mov	(&DWP(4,@T[1]),@T[0]);
++	&add	($E,&DWP(16,@T[1]));
++	&mov	(&DWP(8,@T[1]),$C);
++	&mov	($B,@T[0]);
++	&mov	(&DWP(12,@T[1]),$D);
++	&mov	(&DWP(16,@T[1]),$E);
 +
-+&external_label("OPENSSL_ia32cap_X") if ($xmm);
++	&jmp	(&label("loop"));
 +
++&set_label("done",16);		$j=$saved_j; @V=@saved_V;
 +
- $A="eax";
- $B="ebx";
- $C="ecx";
-@@ -47,6 +129,10 @@ $tmp1="ebp";
- 
- @V=($A,$B,$C,$D,$E,$T);
- 
-+$alt=0;	# 1 denotes alternative IALU implementation, which performs
-+	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
-+	# Sandy Bridge...
++	&Xtail_avx(\&body_20_39);
++	&Xtail_avx(\&body_20_39);
++	&Xtail_avx(\&body_20_39);
 +
- sub BODY_00_15
- 	{
- 	local($n,$a,$b,$c,$d,$e,$f)=@_;
-@@ -59,16 +145,18 @@ sub BODY_00_15
- 	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
- 	 &xor($f,$d);
- 	&add($tmp1,$e);			# tmp1+=e;
--	 &and($f,$b);
--	&mov($e,&swtmp($n%16));		# e becomes volatile and is loaded
-+	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
- 	 				# with xi, also note that e becomes
- 					# f in next round...
--	 &xor($f,$d);			# f holds F_00_19(b,c,d)
-+	&and($f,$b);
- 	&rotr($b,2);			# b=ROTATE(b,30)
--	 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
-+	 &xor($f,$d);			# f holds F_00_19(b,c,d)
-+	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
- 
--	if ($n==15) { &add($f,$tmp1); }	# f+=tmp1
-+	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
-+		      &add($f,$tmp1); }	# f+=tmp1
- 	else        { &add($tmp1,$f); }	# f becomes a in next round
-+	&mov($tmp1,$a)			if ($alt && $n==15);
- 	}
- 
- sub BODY_16_19
-@@ -77,22 +165,41 @@ sub BODY_16_19
- 
- 	&comment("16_19 $n");
- 
--	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
--	 &mov($tmp1,$c);		# tmp1 to hold F_00_19(b,c,d)
--	&xor($f,&swtmp(($n+2)%16));
--	 &xor($tmp1,$d);
--	&xor($f,&swtmp(($n+8)%16));
--	 &and($tmp1,$b);		# tmp1 holds F_00_19(b,c,d)
--	&rotr($b,2);			# b=ROTATE(b,30)
-+if ($alt) {
-+	&xor($c,$d);
-+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-+	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
-+	 &xor($f,&swtmp(($n+8)%16));
-+	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
-+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
-+	&rotl($f,1);			# f=ROTATE(f,1)
-+	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
-+	&xor($c,$d);			# restore $c
-+	 &mov($tmp1,$a);		# b in next round
-+	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
-+	 &mov(&swtmp($n%16),$f);	# xi=f
-+	&rotl($a,5);			# ROTATE(a,5)
-+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
-+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
-+	 &add($f,$a);			# f+=ROTATE(a,5)
-+} else {
-+	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
-+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-+	&xor($tmp1,$d);
-+	 &xor($f,&swtmp(($n+8)%16));
-+	&and($tmp1,$b);
- 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
- 	&rotl($f,1);			# f=ROTATE(f,1)
- 	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
--	&mov(&swtmp($n%16),$f);		# xi=f
--	&lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
--	 &mov($e,$a);			# e becomes volatile
--	&rotl($e,5);			# e=ROTATE(a,5)
--	 &add($f,$tmp1);		# f+=F_00_19(b,c,d)
--	&add($f,$e);			# f+=ROTATE(a,5)
-+	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
-+	 &mov($tmp1,$a);
-+	&rotr($b,2);			# b=ROTATE(b,30)
-+	 &mov(&swtmp($n%16),$f);	# xi=f
-+	&rotl($tmp1,5);			# ROTATE(a,5)
-+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
-+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
-+	 &add($f,$tmp1);		# f+=ROTATE(a,5)
-+}
- 	}
- 
- sub BODY_20_39
-@@ -102,21 +209,41 @@ sub BODY_20_39
- 
- 	&comment("20_39 $n");
- 
-+if ($alt) {
-+	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
-+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-+	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
-+	 &xor($f,&swtmp(($n+8)%16));
-+	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
-+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
-+	&rotl($f,1);			# f=ROTATE(f,1)
-+	 &mov($tmp1,$a);		# b in next round
-+	&rotr($b,7);			# b=ROTATE(b,30)
-+	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
-+	&rotl($a,5);			# ROTATE(a,5)
-+	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
-+	&and($tmp1,$b)			if($n==39);
-+	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
-+	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
-+	 &add($f,$a);			# f+=ROTATE(a,5)
-+	&rotr($a,5)			if ($n==79);
-+} else {
- 	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
--	 &mov($f,&swtmp($n%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
--	&rotr($b,2);			# b=ROTATE(b,30)
--	 &xor($f,&swtmp(($n+2)%16));
-+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
- 	&xor($tmp1,$c);
- 	 &xor($f,&swtmp(($n+8)%16));
- 	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
- 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
- 	&rotl($f,1);			# f=ROTATE(f,1)
--	 &add($tmp1,$e);
--	&mov(&swtmp($n%16),$f);		# xi=f
--	 &mov($e,$a);			# e becomes volatile
--	&rotl($e,5);			# e=ROTATE(a,5)
--	 &lea($f,&DWP($K,$f,$tmp1));	# f+=K_20_39+e
--	&add($f,$e);			# f+=ROTATE(a,5)
-+	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
-+	&rotr($b,2);			# b=ROTATE(b,30)
-+	 &mov($tmp1,$a);
-+	&rotl($tmp1,5);			# ROTATE(a,5)
-+	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
-+	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
-+	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
-+	&add($f,$tmp1);			# f+=ROTATE(a,5)
-+}
- 	}
- 
- sub BODY_40_59
-@@ -125,41 +252,86 @@ sub BODY_40_59
- 
- 	&comment("40_59 $n");
- 
--	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
--	 &mov($tmp1,&swtmp(($n+2)%16));
--	&xor($f,$tmp1);
--	 &mov($tmp1,&swtmp(($n+8)%16));
--	&xor($f,$tmp1);
--	 &mov($tmp1,&swtmp(($n+13)%16));
--	&xor($f,$tmp1);			# f holds xa^xb^xc^xd
--	 &mov($tmp1,$b);		# tmp1 to hold F_40_59(b,c,d)
-+if ($alt) {
-+	&add($e,$tmp1);			# e+=b&(c^d)
-+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-+	&mov($tmp1,$d);
-+	 &xor($f,&swtmp(($n+8)%16));
-+	&xor($c,$d);			# restore $c
-+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
- 	&rotl($f,1);			# f=ROTATE(f,1)
--	 &or($tmp1,$c);
--	&mov(&swtmp($n%16),$f);		# xi=f
--	 &and($tmp1,$d);
--	&lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
--	 &mov($e,$b);			# e becomes volatile and is used
--					# to calculate F_40_59(b,c,d)
-+	 &and($tmp1,$c);
-+	&rotr($b,7);			# b=ROTATE(b,30)
-+	 &add($e,$tmp1);		# e+=c&d
-+	&mov($tmp1,$a);			# b in next round
-+	 &mov(&swtmp($n%16),$f);	# xi=f
-+	&rotl($a,5);			# ROTATE(a,5)
-+	 &xor($b,$c)			if ($n<59);
-+	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
-+	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
-+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
-+	 &add($f,$a);			# f+=ROTATE(a,5)
-+} else {
-+	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
-+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-+	&xor($tmp1,$d);
-+	 &xor($f,&swtmp(($n+8)%16));
-+	&and($tmp1,$b);
-+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
-+	&rotl($f,1);			# f=ROTATE(f,1)
-+	 &add($tmp1,$e);		# b&(c^d)+=e
- 	&rotr($b,2);			# b=ROTATE(b,30)
--	 &and($e,$c);
--	&or($tmp1,$e);			# tmp1 holds F_40_59(b,c,d)		
--	 &mov($e,$a);
--	&rotl($e,5);			# e=ROTATE(a,5)
--	 &add($f,$tmp1);		# f+=tmp1;
-+	 &mov($e,$a);			# e becomes volatile
-+	&rotl($e,5);			# ROTATE(a,5)
-+	 &mov(&swtmp($n%16),$f);	# xi=f
-+	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
-+	 &mov($tmp1,$c);
- 	&add($f,$e);			# f+=ROTATE(a,5)
-+	 &and($tmp1,$d);
-+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
-+	 &add($f,$tmp1);		# f+=c&d
++	&vzeroall();
++
++	&mov	(@T[1],&DWP(192,"esp"));	# update context
++	&add	($A,&DWP(0, at T[1]));
++	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
++	&add	(@T[0],&DWP(4, at T[1]));		# $b
++	&add	($C,&DWP(8, at T[1]));
++	&mov	(&DWP(0, at T[1]),$A);
++	&add	($D,&DWP(12, at T[1]));
++	&mov	(&DWP(4, at T[1]), at T[0]);
++	&add	($E,&DWP(16, at T[1]));
++	&mov	(&DWP(8, at T[1]),$C);
++	&mov	(&DWP(12, at T[1]),$D);
++	&mov	(&DWP(16, at T[1]),$E);
++&function_end("_sha1_block_data_order_avx");
 +}
- 	}
++&set_label("K_XX_XX",64);
++&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
++&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
++&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
++&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
++&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
++}
+ &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
  
- &function_begin("sha1_block_data_order");
-+if ($xmm) {
-+  &static_label("ssse3_shortcut");
-+  &static_label("avx_shortcut")		if ($ymm);
-+  &static_label("K_XX_XX");
+ &asm_finish();
+diff -up openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl.intelopts openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl
+--- openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl	2013-02-19 21:19:43.923583195 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
+@@ -16,7 +16,7 @@
+ # There was suggestion to mechanically translate 32-bit code, but I
+ # dismissed it, reasoning that x86_64 offers enough register bank
+ # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+-# implementation:-) However! While 64-bit code does performs better
++# implementation:-) However! While 64-bit code does perform better
+ # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+ # x86_64 does offer larger *addressable* bank, but out-of-order core
+ # reaches for even more registers through dynamic aliasing, and EM64T
+@@ -29,6 +29,38 @@
+ # Xeon P4	+65%		+0%		9.9
+ # Core2		+60%		+10%		7.0
+ 
++# August 2009.
++#
++# The code was revised to minimize code size and to maximize
++# "distance" between instructions producing input to 'lea'
++# instruction and the 'lea' instruction itself, which is essential
++# for Intel Atom core.
 +
-+	&call	(&label("pic_point"));	# make it PIC!
-+  &set_label("pic_point");
-+	&blindpop($tmp1);
-+	&picmeup($T,"OPENSSL_ia32cap_X",$tmp1,&label("pic_point"));
-+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
++# October 2010.
++#
++# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
++# is to offload message schedule denoted by Wt in NIST specification,
++# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
++# for background and implementation details. The only difference from
++# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
++# to free temporary registers.
 +
-+	&mov	($A,&DWP(0,$T));
-+	&mov	($D,&DWP(4,$T));
-+	&test	($D,1<<9);		# check SSSE3 bit
-+	&jz	(&label("x86"));
-+	&test	($A,1<<24);		# check FXSR bit
-+	&jz	(&label("x86"));
-+	if ($ymm) {
-+		&and	($D,1<<28);		# mask AVX bit
-+		&and	($A,1<<30);		# mask "Intel CPU" bit
-+		&or	($A,$D);
-+		&cmp	($A,1<<28|1<<30);
-+		&je	(&label("avx_shortcut"));
-+	}
-+	&jmp	(&label("ssse3_shortcut"));
-+  &set_label("x86",16);
-+}
- 	&mov($tmp1,&wparam(0));	# SHA_CTX *c
- 	&mov($T,&wparam(1));	# const void *input
- 	&mov($A,&wparam(2));	# size_t num
--	&stack_push(16);	# allocate X[16]
-+	&stack_push(16+3);	# allocate X[16]
- 	&shl($A,6);
- 	&add($A,$T);
- 	&mov(&wparam(2),$A);	# pointer beyond the end of input
- 	&mov($E,&DWP(16,$tmp1));# pre-load E
-+	&jmp(&label("loop"));
++# April 2011.
++#
++# Add AVX code path. See sha1-586.pl for further information.
++
++######################################################################
++# Current performance is summarized in following table. Numbers are
++# CPU clock cycles spent to process single byte (less is better).
++#
++#		x86_64		SSSE3		AVX
++# P4		9.8		-
++# Opteron	6.6		-
++# Core2		6.7		6.1/+10%	-
++# Atom		11.0		9.7/+13%	-
++# Westmere	7.1		5.6/+27%	-
++# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
++
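For orientation, the "message schedule" (Wt/Xupdate) that the comments above talk about offloading to the SIMD unit is the standard SHA-1 recurrence W[t] = ROTL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) for t = 16..79. A minimal scalar Perl sketch of that update over a 16-word ring buffer, for illustration only (the helper name and calling convention are made up, nothing below is part of the patch):

    # @$X is a 16-element ring of 32-bit schedule words, $t the round number.
    sub xupdate_scalar {
        my ($t, $X) = @_;
        my $w = $X->[($t-3)&15] ^ $X->[($t-8)&15]
              ^ $X->[($t-14)&15] ^ $X->[($t-16)&15];
        $w = (($w << 1) | ($w >> 31)) & 0xffffffff;   # ROTL(w,1)
        $X->[$t&15] = $w;                             # reuse the W[t-16] slot
        return $w;
    }

The SSSE3/AVX paths compute four such schedule words per group of vector instructions instead of one at a time.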
+ $flavour = shift;
+ $output  = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+@@ -40,6 +72,13 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+ 
++$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
++	   $1>=2.19);
++$avx=1 if (!$avx && $flavour =~ /nasm/ &&
++	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
++	   $1>=2.03);
++
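The $avx probes just added only enable the AVX code path when the assembler that will consume the generated output understands AVX mnemonics (GNU as 2.19 and NASM 2.03 are taken as the first capable releases). A stand-alone illustration of the same version test, using a made-up banner string rather than real `cc -Wa,-v` output:

    #!/usr/bin/perl
    my $banner = "GNU assembler version 2.20.51 (x86_64-redhat-linux)";   # sample only
    my $avx = ($banner =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19) ? 1 : 0;
    print "avx=$avx\n";    # avx=1 for the sample banner above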
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+ 
+ $ctx="%rdi";	# 1st arg
+@@ -51,196 +90,994 @@ $ctx="%r8";
+ $inp="%r9";
+ $num="%r10";
+ 
+-$xi="%eax";
+-$t0="%ebx";
+-$t1="%ecx";
+-$A="%edx";
+-$B="%esi";
+-$C="%edi";
+-$D="%ebp";
+-$E="%r11d";
+-$T="%r12d";
+-
+- at V=($A,$B,$C,$D,$E,$T);
++$t0="%eax";
++$t1="%ebx";
++$t2="%ecx";
++ at xi=("%edx","%ebp");
++$A="%esi";
++$B="%edi";
++$C="%r11d";
++$D="%r12d";
++$E="%r13d";
+ 
+-sub PROLOGUE {
+-my $func=shift;
+-$code.=<<___;
+-.globl	$func
+-.type	$func,\@function,3
+-.align	16
+-$func:
+-	push	%rbx
+-	push	%rbp
+-	push	%r12
+-	mov	%rsp,%r11
+-	mov	%rdi,$ctx	# reassigned argument
+-	sub	\$`8+16*4`,%rsp
+-	mov	%rsi,$inp	# reassigned argument
+-	and	\$-64,%rsp
+-	mov	%rdx,$num	# reassigned argument
+-	mov	%r11,`16*4`(%rsp)
+-.Lprologue:
+-
+-	mov	0($ctx),$A
+-	mov	4($ctx),$B
+-	mov	8($ctx),$C
+-	mov	12($ctx),$D
+-	mov	16($ctx),$E
+-___
+-}
+-
+-sub EPILOGUE {
+-my $func=shift;
+-$code.=<<___;
+-	mov	`16*4`(%rsp),%rsi
+-	mov	(%rsi),%r12
+-	mov	8(%rsi),%rbp
+-	mov	16(%rsi),%rbx
+-	lea	24(%rsi),%rsp
+-.Lepilogue:
+-	ret
+-.size	$func,.-$func
+-___
+-}
++ at V=($A,$B,$C,$D,$E);
+ 
+ sub BODY_00_19 {
+-my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
++my ($i,$a,$b,$c,$d,$e)=@_;
+ my $j=$i+1;
+ $code.=<<___ if ($i==0);
+-	mov	`4*$i`($inp),$xi	
+-	`"bswap	$xi"	if(!defined($host))`
+-	mov	$xi,`4*$i`(%rsp)
++	mov	`4*$i`($inp),$xi[0]
++	bswap	$xi[0]
++	mov	$xi[0],`4*$i`(%rsp)
+ ___
+ $code.=<<___ if ($i<15);
+-	lea	0x5a827999($xi,$e),$f
+ 	mov	$c,$t0
+-	mov	`4*$j`($inp),$xi
+-	mov	$a,$e
++	mov	`4*$j`($inp),$xi[1]
++	mov	$a,$t2
+ 	xor	$d,$t0
+-	`"bswap	$xi"	if(!defined($host))`	
+-	rol	\$5,$e
++	bswap	$xi[1]
++	rol	\$5,$t2
++	lea	0x5a827999($xi[0],$e),$e
+ 	and	$b,$t0
+-	mov	$xi,`4*$j`(%rsp)
+-	add	$e,$f
++	mov	$xi[1],`4*$j`(%rsp)
++	add	$t2,$e
+ 	xor	$d,$t0
+ 	rol	\$30,$b
+-	add	$t0,$f
++	add	$t0,$e
+ ___
+ $code.=<<___ if ($i>=15);
+-	lea	0x5a827999($xi,$e),$f
+-	mov	`4*($j%16)`(%rsp),$xi
++	mov	`4*($j%16)`(%rsp),$xi[1]
+ 	mov	$c,$t0
+-	mov	$a,$e
+-	xor	`4*(($j+2)%16)`(%rsp),$xi
++	mov	$a,$t2
++	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+ 	xor	$d,$t0
+-	rol	\$5,$e
+-	xor	`4*(($j+8)%16)`(%rsp),$xi
++	rol	\$5,$t2
++	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+ 	and	$b,$t0
+-	add	$e,$f
+-	xor	`4*(($j+13)%16)`(%rsp),$xi
++	lea	0x5a827999($xi[0],$e),$e
++	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
+ 	xor	$d,$t0
++	rol	\$1,$xi[1]
++	add	$t2,$e
+ 	rol	\$30,$b
+-	add	$t0,$f
+-	rol	\$1,$xi
+-	mov	$xi,`4*($j%16)`(%rsp)
++	mov	$xi[1],`4*($j%16)`(%rsp)
++	add	$t0,$e
+ ___
++unshift(@xi,pop(@xi));
+ }
  
--	&set_label("loop",16);
-+&set_label("loop",16);
+ sub BODY_20_39 {
+-my ($i,$a,$b,$c,$d,$e,$f)=@_;
++my ($i,$a,$b,$c,$d,$e)=@_;
+ my $j=$i+1;
+ my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
+ $code.=<<___ if ($i<79);
+-	lea	$K($xi,$e),$f
+-	mov	`4*($j%16)`(%rsp),$xi
++	mov	`4*($j%16)`(%rsp),$xi[1]
+ 	mov	$c,$t0
+-	mov	$a,$e
+-	xor	`4*(($j+2)%16)`(%rsp),$xi
++	mov	$a,$t2
++	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+ 	xor	$b,$t0
+-	rol	\$5,$e
+-	xor	`4*(($j+8)%16)`(%rsp),$xi
++	rol	\$5,$t2
++	lea	$K($xi[0],$e),$e
++	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+ 	xor	$d,$t0
+-	add	$e,$f
+-	xor	`4*(($j+13)%16)`(%rsp),$xi
++	add	$t2,$e
++	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
+ 	rol	\$30,$b
+-	add	$t0,$f
+-	rol	\$1,$xi
++	add	$t0,$e
++	rol	\$1,$xi[1]
+ ___
+ $code.=<<___ if ($i<76);
+-	mov	$xi,`4*($j%16)`(%rsp)
++	mov	$xi[1],`4*($j%16)`(%rsp)
+ ___
+ $code.=<<___ if ($i==79);
+-	lea	$K($xi,$e),$f
+ 	mov	$c,$t0
+-	mov	$a,$e
++	mov	$a,$t2
+ 	xor	$b,$t0
+-	rol	\$5,$e
++	lea	$K($xi[0],$e),$e
++	rol	\$5,$t2
+ 	xor	$d,$t0
+-	add	$e,$f
++	add	$t2,$e
+ 	rol	\$30,$b
+-	add	$t0,$f
++	add	$t0,$e
+ ___
++unshift(@xi,pop(@xi));
+ }
  
- 	# copy input chunk to X, but reversing byte order!
- 	for ($i=0; $i<16; $i+=4)
-@@ -213,8 +385,845 @@ sub BODY_40_59
- 	&mov(&DWP(16,$tmp1),$C);
- 	&jb(&label("loop"));
+ sub BODY_40_59 {
+-my ($i,$a,$b,$c,$d,$e,$f)=@_;
++my ($i,$a,$b,$c,$d,$e)=@_;
+ my $j=$i+1;
+ $code.=<<___;
+-	lea	0x8f1bbcdc($xi,$e),$f
+-	mov	`4*($j%16)`(%rsp),$xi
+-	mov	$b,$t0
+-	mov	$b,$t1
+-	xor	`4*(($j+2)%16)`(%rsp),$xi
+-	mov	$a,$e
+-	and	$c,$t0
+-	xor	`4*(($j+8)%16)`(%rsp),$xi
+-	or	$c,$t1
+-	rol	\$5,$e
+-	xor	`4*(($j+13)%16)`(%rsp),$xi
+-	and	$d,$t1
+-	add	$e,$f
+-	rol	\$1,$xi
+-	or	$t1,$t0
++	mov	`4*($j%16)`(%rsp),$xi[1]
++	mov	$c,$t0
++	mov	$c,$t1
++	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
++	and	$d,$t0
++	mov	$a,$t2
++	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
++	xor	$d,$t1
++	lea	0x8f1bbcdc($xi[0],$e),$e
++	rol	\$5,$t2
++	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
++	add	$t0,$e
++	and	$b,$t1
++	rol	\$1,$xi[1]
++	add	$t1,$e
+ 	rol	\$30,$b
+-	mov	$xi,`4*($j%16)`(%rsp)
+-	add	$t0,$f
++	mov	$xi[1],`4*($j%16)`(%rsp)
++	add	$t2,$e
+ ___
++unshift(@xi,pop(@xi));
+ }
  
--	&stack_pop(16);
-+	&stack_pop(16+3);
- &function_end("sha1_block_data_order");
+-$code=".text\n";
++$code.=<<___;
++.text
++.extern	OPENSSL_ia32cap_X
 +
-+if ($xmm) {
-+######################################################################
-+# The SSSE3 implementation.
-+#
-+# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
-+# 32 elements of the message schedule or Xupdate outputs. First 4
-+# quadruples are simply byte-swapped input, next 4 are calculated
-+# according to method originally suggested by Dean Gaudet (modulo
-+# being implemented in SSSE3). Once 8 quadruples or 32 elements are
-+# collected, it switches to routine proposed by Max Locktyukhin.
-+#
-+# Calculations inevitably require temporary reqisters, and there are
-+# no %xmm registers left to spare. For this reason part of the ring
-+# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring
-+# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
-+# X[-5], and X[4] - X[-4]...
-+#
-+# Another notable optimization is aggressive stack frame compression
-+# aiming to minimize amount of 9-byte instructions...
-+#
-+# Yet another notable optimization is "jumping" $B variable. It means
-+# that there is no register permanently allocated for $B value. This
-+# allowed to eliminate one instruction from body_20_39...
-+#
-+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
-+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
-+my @V=($A,$B,$C,$D,$E);
-+my $j=0;			# hash round
-+my @T=($T,$tmp1);
-+my $inp;
++.globl	sha1_block_data_order
++.type	sha1_block_data_order,\@function,3
++.align	16
++sha1_block_data_order:
++	mov	OPENSSL_ia32cap_X+0(%rip),%r9d
++	mov	OPENSSL_ia32cap_X+4(%rip),%r8d
++	test	\$`1<<9`,%r8d		# check SSSE3 bit
++	jz	.Lialu
++___
++$code.=<<___ if ($avx);
++	and	\$`1<<28`,%r8d		# mask AVX bit
++	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
++	or	%r9d,%r8d
++	cmp	\$`1<<28|1<<30`,%r8d
++	je	_avx_shortcut
++___
++$code.=<<___;
++	jmp	_ssse3_shortcut
 +
-+my $_rol=sub { &rol(@_) };
-+my $_ror=sub { &ror(@_) };
++.align	16
++.Lialu:
++	push	%rbx
++	push	%rbp
++	push	%r12
++	push	%r13
++	mov	%rsp,%r11
++	mov	%rdi,$ctx	# reassigned argument
++	sub	\$`8+16*4`,%rsp
++	mov	%rsi,$inp	# reassigned argument
++	and	\$-64,%rsp
++	mov	%rdx,$num	# reassigned argument
++	mov	%r11,`16*4`(%rsp)
++.Lprologue:
 +
-+&function_begin("_sha1_block_data_order_ssse3");
-+	&call	(&label("pic_point"));	# make it PIC!
-+	&set_label("pic_point");
-+	&blindpop($tmp1);
-+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-+&set_label("ssse3_shortcut");
++	mov	0($ctx),$A
++	mov	4($ctx),$B
++	mov	8($ctx),$C
++	mov	12($ctx),$D
++	mov	16($ctx),$E
++	jmp	.Lloop
+ 
+-&PROLOGUE("sha1_block_data_order");
+-$code.=".align	4\n.Lloop:\n";
++.align	16
++.Lloop:
++___
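The entry point emitted by the heredoc above picks one of three code paths from the two OPENSSL_ia32cap_X words: bit 9 of the second word gates SSSE3, and (when the AVX path was assembled at all) the AVX route is taken only if both the AVX bit (bit 28, second word) and the "Intel CPU" bit (bit 30, first word) are set. A small Perl model of that decision, purely illustrative, with $cap0/$cap1 standing in for the two capability words:

    sub pick_sha1_path {
        my ($cap0, $cap1) = @_;                 # OPENSSL_ia32cap_X+0, +4
        return "ialu"  unless $cap1 & (1<<9);   # no SSSE3 => scalar x86_64 code
        return "avx"   if ($cap1 & (1<<28)) && ($cap0 & (1<<30));
        return "ssse3";
    }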
+ for($i=0;$i<20;$i++)	{ &BODY_00_19($i, at V); unshift(@V,pop(@V)); }
+ for(;$i<40;$i++)	{ &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+ for(;$i<60;$i++)	{ &BODY_40_59($i, at V); unshift(@V,pop(@V)); }
+ for(;$i<80;$i++)	{ &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+ $code.=<<___;
+-	add	0($ctx),$E
+-	add	4($ctx),$T
+-	add	8($ctx),$A
+-	add	12($ctx),$B
+-	add	16($ctx),$C
+-	mov	$E,0($ctx)
+-	mov	$T,4($ctx)
+-	mov	$A,8($ctx)
+-	mov	$B,12($ctx)
+-	mov	$C,16($ctx)
+-
+-	xchg	$E,$A	# mov	$E,$A
+-	xchg	$T,$B	# mov	$T,$B
+-	xchg	$E,$C	# mov	$A,$C
+-	xchg	$T,$D	# mov	$B,$D
+-			# mov	$C,$E
+-	lea	`16*4`($inp),$inp
++	add	0($ctx),$A
++	add	4($ctx),$B
++	add	8($ctx),$C
++	add	12($ctx),$D
++	add	16($ctx),$E
++	mov	$A,0($ctx)
++	mov	$B,4($ctx)
++	mov	$C,8($ctx)
++	mov	$D,12($ctx)
++	mov	$E,16($ctx)
 +
-+	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
-+	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
-+	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
-+	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
-+	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask
+ 	sub	\$1,$num
++	lea	`16*4`($inp),$inp
+ 	jnz	.Lloop
++
++	mov	`16*4`(%rsp),%rsi
++	mov	(%rsi),%r13
++	mov	8(%rsi),%r12
++	mov	16(%rsi),%rbp
++	mov	24(%rsi),%rbx
++	lea	32(%rsi),%rsp
++.Lepilogue:
++	ret
++.size	sha1_block_data_order,.-sha1_block_data_order
+ ___
+-&EPILOGUE("sha1_block_data_order");
++{{{
++my $Xi=4;
++my @X=map("%xmm$_",(4..7,0..3));
++my @Tx=map("%xmm$_",(8..10));
++my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
++my @T=("%esi","%edi");
++my $j=0;
++my $K_XX_XX="%r11";
++
++my $_rol=sub { &rol(@_) };
++my $_ror=sub { &ror(@_) };
 +
-+	&mov	($E,&wparam(0));		# load argument block
-+	&mov	($inp=@T[1],&wparam(1));
-+	&mov	($D,&wparam(2));
-+	&mov	(@T[0],"esp");
+ $code.=<<___;
+-.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.type	sha1_block_data_order_ssse3,\@function,3
+ .align	16
++sha1_block_data_order_ssse3:
++_ssse3_shortcut:
++	push	%rbx
++	push	%rbp
++	push	%r12
++	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
++___
++$code.=<<___ if ($win64);
++	movaps	%xmm6,64+0(%rsp)
++	movaps	%xmm7,64+16(%rsp)
++	movaps	%xmm8,64+32(%rsp)
++	movaps	%xmm9,64+48(%rsp)
++	movaps	%xmm10,64+64(%rsp)
++.Lprologue_ssse3:
++___
++$code.=<<___;
++	mov	%rdi,$ctx	# reassigned argument
++	mov	%rsi,$inp	# reassigned argument
++	mov	%rdx,$num	# reassigned argument
 +
-+	# stack frame layout
-+	#
-+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
-+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
-+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
-+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
-+	#
-+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
-+	#	X[4]	X[5]	X[6]	X[7]
-+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
-+	#
-+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
-+	#	K_40_59	K_40_59	K_40_59	K_40_59
-+	#	K_60_79	K_60_79	K_60_79	K_60_79
-+	#	K_00_19	K_00_19	K_00_19	K_00_19
-+	#	pbswap mask
-+	#
-+	# +192	ctx				# argument block
-+	# +196	inp
-+	# +200	end
-+	# +204	esp
-+	&sub	("esp",208);
-+	&and	("esp",-64);
++	shl	\$6,$num
++	add	$inp,$num
++	lea	K_XX_XX(%rip),$K_XX_XX
 +
-+	&movdqa	(&QWP(112+0,"esp"), at X[4]);	# copy constants
-+	&movdqa	(&QWP(112+16,"esp"), at X[5]);
-+	&movdqa	(&QWP(112+32,"esp"), at X[6]);
-+	&shl	($D,6);				# len*64
-+	&movdqa	(&QWP(112+48,"esp"), at X[3]);
-+	&add	($D,$inp);			# end of input
-+	&movdqa	(&QWP(112+64,"esp"), at X[2]);
-+	&add	($inp,64);
-+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
-+	&mov	(&DWP(192+4,"esp"),$inp);
-+	&mov	(&DWP(192+8,"esp"),$D);
-+	&mov	(&DWP(192+12,"esp"), at T[0]);	# save original %esp
++	mov	0($ctx),$A		# load context
++	mov	4($ctx),$B
++	mov	8($ctx),$C
++	mov	12($ctx),$D
++	mov	$B, at T[0]		# magic seed
++	mov	16($ctx),$E
 +
-+	&mov	($A,&DWP(0,$E));		# load context
-+	&mov	($B,&DWP(4,$E));
-+	&mov	($C,&DWP(8,$E));
-+	&mov	($D,&DWP(12,$E));
-+	&mov	($E,&DWP(16,$E));
-+	&mov	(@T[0],$B);			# magic seed
++	movdqa	64($K_XX_XX), at X[2]	# pbswap mask
++	movdqa	0($K_XX_XX), at Tx[1]	# K_00_19
++	movdqu	0($inp), at X[-4&7]	# load input to %xmm[0-3]
++	movdqu	16($inp), at X[-3&7]
++	movdqu	32($inp), at X[-2&7]
++	movdqu	48($inp), at X[-1&7]
++	pshufb	@X[2], at X[-4&7]		# byte swap
++	add	\$64,$inp
++	pshufb	@X[2], at X[-3&7]
++	pshufb	@X[2], at X[-2&7]
++	pshufb	@X[2], at X[-1&7]
++	paddd	@Tx[1], at X[-4&7]		# add K_00_19
++	paddd	@Tx[1], at X[-3&7]
++	paddd	@Tx[1], at X[-2&7]
++	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
++	psubd	@Tx[1], at X[-4&7]		# restore X[]
++	movdqa	@X[-3&7],16(%rsp)
++	psubd	@Tx[1], at X[-3&7]
++	movdqa	@X[-2&7],32(%rsp)
++	psubd	@Tx[1], at X[-2&7]
++	jmp	.Loop_ssse3
++___
 +
-+	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
-+	&movdqu	(@X[-3&7],&QWP(-48,$inp));
-+	&movdqu	(@X[-2&7],&QWP(-32,$inp));
-+	&movdqu	(@X[-1&7],&QWP(-16,$inp));
-+	&pshufb	(@X[-4&7], at X[2]);		# byte swap
-+	&pshufb	(@X[-3&7], at X[2]);
-+	&pshufb	(@X[-2&7], at X[2]);
-+	&movdqa	(&QWP(112-16,"esp"), at X[3]);	# borrow last backtrace slot
-+	&pshufb	(@X[-1&7], at X[2]);
-+	&paddd	(@X[-4&7], at X[3]);		# add K_00_19
-+	&paddd	(@X[-3&7], at X[3]);
-+	&paddd	(@X[-2&7], at X[3]);
-+	&movdqa	(&QWP(0,"esp"), at X[-4&7]);	# X[]+K xfer to IALU
-+	&psubd	(@X[-4&7], at X[3]);		# restore X[]
-+	&movdqa	(&QWP(0+16,"esp"), at X[-3&7]);
-+	&psubd	(@X[-3&7], at X[3]);
-+	&movdqa	(&QWP(0+32,"esp"), at X[-2&7]);
-+	&psubd	(@X[-2&7], at X[3]);
-+	&movdqa	(@X[0], at X[-3&7]);
-+	&jmp	(&label("loop"));
++sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
++{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
++  my $arg = pop;
++    $arg = "\$$arg" if ($arg*1 eq $arg);
++    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
++}
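The AUTOLOAD thunk above is what lets the SIMD sections keep the 32-bit perlasm calling style (&movdqa(...), &pshufb(...), ...) while appending AT&T-syntax text to $code: the last argument becomes the first operand, the other arguments follow in reverse, and a bare number is turned into an immediate. A self-contained toy showing the expansion (register choices are arbitrary, not taken from the patch):

    #!/usr/bin/perl
    use strict;
    our ($code, $AUTOLOAD);
    sub AUTOLOAD {
        my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
        my $arg = pop;
        $arg = "\$$arg" if ($arg*1 eq $arg);       # bare number => immediate
        $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    }
    &vpaddd("%xmm8","%xmm8","%xmm1");   # destination first, perlasm style
    &rol("%eax",5);
    print $code;    # emits "vpaddd %xmm1,%xmm8,%xmm8" and "rol $5,%eax"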
 +
-+######################################################################
 +# SSE instruction sequence is first broken into groups of independent
 +# instructions, independent with respect to their inputs and shifter
-+# (not all architectures have more than one). Then IALU instructions
-+# are "knitted in" between the SSE groups. Distance is maintained for
 +# SSE latency of 2 in hope that it better fits the upcoming AMD Bulldozer
-+# [which allegedly also implements SSSE3]...
-+#
-+# Temporary registers usage. X[2] is volatile at the entry and at the
-+# end is restored from backtrace ring buffer. X[3] is expected to
 +# contain current K_XX_XX constant and is used to calculate X[-1]+K
-+# from previous round, it becomes volatile the moment the value is
-+# saved to stack for transfer to IALU. X[4] becomes volatile whenever
-+# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
-+# end it is loaded with next K_XX_XX [which becomes X[3] in next
-+# round]...
-+#
 +sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
 +{ use integer;
 +  my $body = shift;
 +  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
 +  my ($a,$b,$c,$d,$e);
 +
++	&movdqa	(@X[0], at X[-3&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	&movdqa	(@Tx[0], at X[-1&7]);
 +	&palignr(@X[0], at X[-4&7],8);	# compose "X[-14]" in "X[0]"
-+	&movdqa	(@X[2], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	  &paddd	(@X[3], at X[-1&7]);
-+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
++	  &paddd	(@Tx[1], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
++	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&pxor	(@X[0], at X[-4&7]);	# "X[0]"^="X[-16]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&pxor	(@X[2], at X[-2&7]);	# "X[-3]"^"X[-8]"
++	&pxor	(@Tx[0], at X[-2&7]);	# "X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&pxor	(@X[0], at X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
++	&pxor	(@X[0], at Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
++	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&movdqa	(@X[4], at X[0]);
-+	&movdqa	(@X[2], at X[0]);
++	&movdqa	(@Tx[2], at X[0]);
++	&movdqa	(@Tx[0], at X[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
++	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
 +	&paddd	(@X[0], at X[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&psrld	(@X[2],31);
++	&psrld	(@Tx[0],31);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&movdqa	(@X[3], at X[4]);
++	&movdqa	(@Tx[1], at Tx[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&psrld	(@X[4],30);
-+	&por	(@X[0], at X[2]);		# "X[0]"<<<=1
++	&psrld	(@Tx[2],30);
++	&por	(@X[0], at Tx[0]);		# "X[0]"<<<=1
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&pslld	(@X[3],2);
-+	&pxor	(@X[0], at X[4]);
++	&pslld	(@Tx[1],2);
++	&pxor	(@X[0], at Tx[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
++	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&pxor	(@X[0], at X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
-+	  &movdqa	(@X[1], at X[-2&7])	if ($Xi<7);
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
++	&pxor	(@X[0], at Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
 +
 +	 foreach (@insns) { eval; }	# remaining instructions [if any]
 +
 +  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++		push(@Tx,shift(@Tx));
 +}
 +
 +sub Xupdate_ssse3_32_79()
@@ -5355,35 +5220,34 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
 +  my ($a,$b,$c,$d,$e);
 +
-+	&movdqa	(@X[2], at X[-1&7])	if ($Xi==8);
++	&movdqa	(@Tx[0], at X[-1&7])	if ($Xi==8);
 +	 eval(shift(@insns));		# body_20_39
 +	&pxor	(@X[0], at X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
-+	&palignr(@X[2], at X[-2&7],8);	# compose "X[-6]"
++	&palignr(@Tx[0], at X[-2&7],8);	# compose "X[-6]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +
 +	&pxor	(@X[0], at X[-7&7]);	# "X[0]"^="X[-28]"
-+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);	# save X[] to backtrace buffer
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	 if ($Xi%5) {
-+	  &movdqa	(@X[4], at X[3]);	# "perpetuate" K_XX_XX...
-+	 } else {			# ... or load next one
-+	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
-+	 }
-+	  &paddd	(@X[3], at X[-1&7]);
++	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
++	if ($Xi%5) {
++	  &movdqa	(@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
++	} else {			# ... or load next one
++	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
++	}
++	  &paddd	(@Tx[1], at X[-1&7]);
 +	 eval(shift(@insns));		# ror
 +	 eval(shift(@insns));
 +
-+	&pxor	(@X[0], at X[2]);		# "X[0]"^="X[-6]"
++	&pxor	(@X[0], at Tx[0]);		# "X[0]"^="X[-6]"
 +	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +
-+	&movdqa	(@X[2], at X[0]);
-+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
++	&movdqa	(@Tx[0], at X[0]);
++	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# ror
@@ -5392,7 +5256,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	&pslld	(@X[0],2);
 +	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
-+	&psrld	(@X[2],30);
++	&psrld	(@Tx[0],30);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
@@ -5400,21 +5264,21 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	 eval(shift(@insns));		# ror
 +	 eval(shift(@insns));
 +
-+	&por	(@X[0], at X[2]);		# "X[0]"<<<=2
++	&por	(@X[0], at Tx[0]);		# "X[0]"<<<=2
 +	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
-+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
++	  &movdqa	(@Tx[1], at X[0])	if ($Xi<19);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# ror
-+	  &movdqa	(@X[3], at X[0])	if ($Xi<19);
++	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +
 +	 foreach (@insns) { eval; }	# remaining instructions
 +
 +  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++		push(@Tx,shift(@Tx));
 +}
 +
 +sub Xuplast_ssse3_80()
@@ -5424,30 +5288,29 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +  my ($a,$b,$c,$d,$e);
 +
 +	 eval(shift(@insns));
-+	  &paddd	(@X[3], at X[-1&7]);
++	  &paddd	(@Tx[1], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer IALU
++	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer IALU
 +
 +	 foreach (@insns) { eval; }		# remaining instructions
 +
-+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
-+	&cmp	($inp,&DWP(192+8,"esp"));
-+	&je	(&label("done"));
++	&cmp	($inp,$num);
++	&je	(".Ldone_ssse3");
 +
-+	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
-+	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
-+	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
-+	&movdqu	(@X[-3&7],&QWP(16,$inp));
-+	&movdqu	(@X[-2&7],&QWP(32,$inp));
-+	&movdqu	(@X[-1&7],&QWP(48,$inp));
-+	&add	($inp,64);
++	unshift(@Tx,pop(@Tx));
++
++	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
++	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
++	&movdqu	(@X[-4&7],"0($inp)");		# load input
++	&movdqu	(@X[-3&7],"16($inp)");
++	&movdqu	(@X[-2&7],"32($inp)");
++	&movdqu	(@X[-1&7],"48($inp)");
 +	&pshufb	(@X[-4&7], at X[2]);		# byte swap
-+	&mov	(&DWP(192+4,"esp"),$inp);
-+	&movdqa	(&QWP(112-16,"esp"), at X[3]);	# borrow last backtrace slot
++	&add	($inp,64);
 +
 +  $Xi=0;
 +}
@@ -5463,15 +5326,15 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	&pshufb	(@X[($Xi-3)&7], at X[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&paddd	(@X[($Xi-4)&7], at X[3]);
++	&paddd	(@X[($Xi-4)&7], at Tx[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&movdqa	(&QWP(0+16*$Xi,"esp"), at X[($Xi-4)&7]);	# X[]+K xfer to IALU
++	&movdqa	(eval(16*$Xi)."(%rsp)", at X[($Xi-4)&7]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&psubd	(@X[($Xi-4)&7], at X[3]);
++	&psubd	(@X[($Xi-4)&7], at Tx[1]);
 +
 +	foreach (@insns) { eval; }
 +  $Xi++;
@@ -5489,7 +5352,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +sub body_00_19 () {
 +	(
 +	'($a,$b,$c,$d,$e)=@V;'.
-+	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
++	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
 +	'&xor	($c,$d);',
 +	'&mov	(@T[1],$a);',	# $b in next round
 +	'&$_rol	($a,5);',
@@ -5505,7 +5368,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +sub body_20_39 () {
 +	(
 +	'($a,$b,$c,$d,$e)=@V;'.
-+	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
++	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
 +	'&xor	(@T[0],$d);',	# ($b^$d)
 +	'&mov	(@T[1],$a);',	# $b in next round
 +	'&$_rol	($a,5);',
@@ -5521,7 +5384,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	'($a,$b,$c,$d,$e)=@V;'.
 +	'&mov	(@T[1],$c);',
 +	'&xor	($c,$d);',
-+	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
++	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
 +	'&and	(@T[1],$d);',
 +	'&and	(@T[0],$c);',	# ($b&($c^$d))
 +	'&$_ror	($b,7);',	# $b>>>2
@@ -5533,8 +5396,10 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 +	);
 +}
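Note that the body_00_19/body_20_39/body_40_59 generators above do not emit code themselves; each returns a list of strings, one scalar-round micro-step per string, and the Xupdate_*/Xloop_* routines eval those strings one at a time between SIMD instructions, which is how the integer rounds get knitted into the vector message-schedule code. A deliberately tiny runnable model of that mechanism (illustration only, nothing here comes from the patch):

    #!/usr/bin/perl
    use strict;
    my @insns = ('print "  scalar round step 1\n";',
                 'print "  scalar round step 2\n";',
                 'print "  scalar round step 3\n";');
    print "simd step A\n";
    eval(shift(@insns));            # knit one scalar step in
    print "simd step B\n";
    eval(shift(@insns));
    foreach (@insns) { eval; }      # flush the remaining scalar steps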
-+
-+&set_label("loop",16);
++$code.=<<___;
++.align	16
++.Loop_ssse3:
++___
 +	&Xupdate_ssse3_16_31(\&body_00_19);
 +	&Xupdate_ssse3_16_31(\&body_00_19);
 +	&Xupdate_ssse3_16_31(\&body_00_19);
@@ -5559,133 +5424,125 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	&Xloop_ssse3(\&body_20_39);
 +	&Xloop_ssse3(\&body_20_39);
 +
-+	&mov	(@T[1],&DWP(192,"esp"));	# update context
-+	&add	($A,&DWP(0, at T[1]));
-+	&add	(@T[0],&DWP(4, at T[1]));		# $b
-+	&add	($C,&DWP(8, at T[1]));
-+	&mov	(&DWP(0, at T[1]),$A);
-+	&add	($D,&DWP(12, at T[1]));
-+	&mov	(&DWP(4, at T[1]), at T[0]);
-+	&add	($E,&DWP(16, at T[1]));
-+	&mov	(&DWP(8, at T[1]),$C);
-+	&mov	($B, at T[0]);
-+	&mov	(&DWP(12, at T[1]),$D);
-+	&mov	(&DWP(16, at T[1]),$E);
-+	&movdqa	(@X[0], at X[-3&7]);
-+
-+	&jmp	(&label("loop"));
++$code.=<<___;
++	add	0($ctx),$A			# update context
++	add	4($ctx), at T[0]
++	add	8($ctx),$C
++	add	12($ctx),$D
++	mov	$A,0($ctx)
++	add	16($ctx),$E
++	mov	@T[0],4($ctx)
++	mov	@T[0],$B			# magic seed
++	mov	$C,8($ctx)
++	mov	$D,12($ctx)
++	mov	$E,16($ctx)
++	jmp	.Loop_ssse3
 +
-+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
++.align	16
++.Ldone_ssse3:
++___
++				$j=$saved_j; @V=@saved_V;
 +
 +	&Xtail_ssse3(\&body_20_39);
 +	&Xtail_ssse3(\&body_20_39);
 +	&Xtail_ssse3(\&body_20_39);
 +
-+	&mov	(@T[1],&DWP(192,"esp"));	# update context
-+	&add	($A,&DWP(0, at T[1]));
-+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
-+	&add	(@T[0],&DWP(4, at T[1]));		# $b
-+	&add	($C,&DWP(8, at T[1]));
-+	&mov	(&DWP(0, at T[1]),$A);
-+	&add	($D,&DWP(12, at T[1]));
-+	&mov	(&DWP(4, at T[1]), at T[0]);
-+	&add	($E,&DWP(16, at T[1]));
-+	&mov	(&DWP(8, at T[1]),$C);
-+	&mov	(&DWP(12, at T[1]),$D);
-+	&mov	(&DWP(16, at T[1]),$E);
-+
-+&function_end("_sha1_block_data_order_ssse3");
++$code.=<<___;
++	add	0($ctx),$A			# update context
++	add	4($ctx), at T[0]
++	add	8($ctx),$C
++	mov	$A,0($ctx)
++	add	12($ctx),$D
++	mov	@T[0],4($ctx)
++	add	16($ctx),$E
++	mov	$C,8($ctx)
++	mov	$D,12($ctx)
++	mov	$E,16($ctx)
++___
++$code.=<<___ if ($win64);
++	movaps	64+0(%rsp),%xmm6
++	movaps	64+16(%rsp),%xmm7
++	movaps	64+32(%rsp),%xmm8
++	movaps	64+48(%rsp),%xmm9
++	movaps	64+64(%rsp),%xmm10
++___
++$code.=<<___;
++	lea	`64+($win64?6*16:0)`(%rsp),%rsi
++	mov	0(%rsi),%r12
++	mov	8(%rsi),%rbp
++	mov	16(%rsi),%rbx
++	lea	24(%rsi),%rsp
++.Lepilogue_ssse3:
++	ret
++.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
++___
 +
-+if ($ymm) {
-+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
-+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
-+my @V=($A,$B,$C,$D,$E);
-+my $j=0;			# hash round
-+my @T=($T,$tmp1);
-+my $inp;
++if ($avx) {
++my $Xi=4;
++my @X=map("%xmm$_",(4..7,0..3));
++my @Tx=map("%xmm$_",(8..10));
++my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
++my @T=("%esi","%edi");
++my $j=0;
++my $K_XX_XX="%r11";
 +
 +my $_rol=sub { &shld(@_[0], at _) };
 +my $_ror=sub { &shrd(@_[0], at _) };
 +
-+&function_begin("_sha1_block_data_order_avx");
-+	&call	(&label("pic_point"));	# make it PIC!
-+	&set_label("pic_point");
-+	&blindpop($tmp1);
-+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-+&set_label("avx_shortcut");
-+	&vzeroall();
-+
-+	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
-+	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
-+	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
-+	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
-+	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask
-+
-+	&mov	($E,&wparam(0));		# load argument block
-+	&mov	($inp=@T[1],&wparam(1));
-+	&mov	($D,&wparam(2));
-+	&mov	(@T[0],"esp");
-+
-+	# stack frame layout
-+	#
-+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
-+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
-+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
-+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
-+	#
-+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
-+	#	X[4]	X[5]	X[6]	X[7]
-+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
-+	#
-+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
-+	#	K_40_59	K_40_59	K_40_59	K_40_59
-+	#	K_60_79	K_60_79	K_60_79	K_60_79
-+	#	K_00_19	K_00_19	K_00_19	K_00_19
-+	#	pbswap mask
-+	#
-+	# +192	ctx				# argument block
-+	# +196	inp
-+	# +200	end
-+	# +204	esp
-+	&sub	("esp",208);
-+	&and	("esp",-64);
++$code.=<<___;
++.type	sha1_block_data_order_avx,\@function,3
++.align	16
++sha1_block_data_order_avx:
++_avx_shortcut:
++	push	%rbx
++	push	%rbp
++	push	%r12
++	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
++___
++$code.=<<___ if ($win64);
++	movaps	%xmm6,64+0(%rsp)
++	movaps	%xmm7,64+16(%rsp)
++	movaps	%xmm8,64+32(%rsp)
++	movaps	%xmm9,64+48(%rsp)
++	movaps	%xmm10,64+64(%rsp)
++.Lprologue_avx:
++___
++$code.=<<___;
++	mov	%rdi,$ctx	# reassigned argument
++	mov	%rsi,$inp	# reassigned argument
++	mov	%rdx,$num	# reassigned argument
++	vzeroall
 +
-+	&vmovdqa(&QWP(112+0,"esp"), at X[4]);	# copy constants
-+	&vmovdqa(&QWP(112+16,"esp"), at X[5]);
-+	&vmovdqa(&QWP(112+32,"esp"), at X[6]);
-+	&shl	($D,6);				# len*64
-+	&vmovdqa(&QWP(112+48,"esp"), at X[3]);
-+	&add	($D,$inp);			# end of input
-+	&vmovdqa(&QWP(112+64,"esp"), at X[2]);
-+	&add	($inp,64);
-+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
-+	&mov	(&DWP(192+4,"esp"),$inp);
-+	&mov	(&DWP(192+8,"esp"),$D);
-+	&mov	(&DWP(192+12,"esp"), at T[0]);	# save original %esp
++	shl	\$6,$num
++	add	$inp,$num
++	lea	K_XX_XX(%rip),$K_XX_XX
 +
-+	&mov	($A,&DWP(0,$E));		# load context
-+	&mov	($B,&DWP(4,$E));
-+	&mov	($C,&DWP(8,$E));
-+	&mov	($D,&DWP(12,$E));
-+	&mov	($E,&DWP(16,$E));
-+	&mov	(@T[0],$B);			# magic seed
++	mov	0($ctx),$A		# load context
++	mov	4($ctx),$B
++	mov	8($ctx),$C
++	mov	12($ctx),$D
++	mov	$B, at T[0]		# magic seed
++	mov	16($ctx),$E
 +
-+	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
-+	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
-+	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
-+	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
-+	&vpshufb(@X[-4&7], at X[-4&7], at X[2]);	# byte swap
-+	&vpshufb(@X[-3&7], at X[-3&7], at X[2]);
-+	&vpshufb(@X[-2&7], at X[-2&7], at X[2]);
-+	&vmovdqa(&QWP(112-16,"esp"), at X[3]);	# borrow last backtrace slot
-+	&vpshufb(@X[-1&7], at X[-1&7], at X[2]);
-+	&vpaddd	(@X[0], at X[-4&7], at X[3]);		# add K_00_19
-+	&vpaddd	(@X[1], at X[-3&7], at X[3]);
-+	&vpaddd	(@X[2], at X[-2&7], at X[3]);
-+	&vmovdqa(&QWP(0,"esp"), at X[0]);		# X[]+K xfer to IALU
-+	&vmovdqa(&QWP(0+16,"esp"), at X[1]);
-+	&vmovdqa(&QWP(0+32,"esp"), at X[2]);
-+	&jmp	(&label("loop"));
++	vmovdqa	64($K_XX_XX), at X[2]	# pbswap mask
++	vmovdqa	0($K_XX_XX), at Tx[1]	# K_00_19
++	vmovdqu	0($inp), at X[-4&7]	# load input to %xmm[0-3]
++	vmovdqu	16($inp), at X[-3&7]
++	vmovdqu	32($inp), at X[-2&7]
++	vmovdqu	48($inp), at X[-1&7]
++	vpshufb	@X[2], at X[-4&7], at X[-4&7]	# byte swap
++	add	\$64,$inp
++	vpshufb	@X[2], at X[-3&7], at X[-3&7]
++	vpshufb	@X[2], at X[-2&7], at X[-2&7]
++	vpshufb	@X[2], at X[-1&7], at X[-1&7]
++	vpaddd	@Tx[1], at X[-4&7], at X[0]	# add K_00_19
++	vpaddd	@Tx[1], at X[-3&7], at X[1]
++	vpaddd	@Tx[1], at X[-2&7], at X[2]
++	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
++	vmovdqa	@X[1],16(%rsp)
++	vmovdqa	@X[2],32(%rsp)
++	jmp	.Loop_avx
++___
 +
 +sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
 +{ use integer;
@@ -5699,70 +5556,68 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	  &vpaddd	(@X[3], at X[3], at X[-1&7]);
-+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
++	  &vpaddd	(@Tx[1], at Tx[1], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpsrldq(@X[2], at X[-1&7],4);		# "X[-3]", 3 dwords
++	&vpsrldq(@Tx[0], at X[-1&7],4);	# "X[-3]", 3 dwords
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	&vpxor	(@X[0], at X[0], at X[-4&7]);		# "X[0]"^="X[-16]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpxor	(@X[2], at X[2], at X[-2&7]);		# "X[-3]"^"X[-8]"
++	&vpxor	(@Tx[0], at Tx[0], at X[-2&7]);	# "X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpxor	(@X[0], at X[0], at X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
++	&vpxor	(@X[0], at X[0], at Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
++	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpsrld	(@X[2], at X[0],31);
++	&vpsrld	(@Tx[0], at X[0],31);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpslldq(@X[4], at X[0],12);		# "X[0]"<<96, extract one dword
++	&vpslldq(@Tx[2], at X[0],12);		# "X[0]"<<96, extract one dword
 +	&vpaddd	(@X[0], at X[0], at X[0]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpsrld	(@X[3], at X[4],30);
-+	&vpor	(@X[0], at X[0], at X[2]);		# "X[0]"<<<=1
++	&vpsrld	(@Tx[1], at Tx[2],30);
++	&vpor	(@X[0], at X[0], at Tx[0]);		# "X[0]"<<<=1
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpslld	(@X[4], at X[4],2);
-+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
-+	 eval(shift(@insns));
-+	 eval(shift(@insns));
-+	&vpxor	(@X[0], at X[0], at X[3]);
++	&vpslld	(@Tx[2], at Tx[2],2);
++	&vpxor	(@X[0], at X[0], at Tx[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	&vpxor	(@X[0], at X[0], at X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
++	&vpxor	(@X[0], at X[0], at Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
++	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
++
 +	 foreach (@insns) { eval; }	# remaining instructions [if any]
 +
 +  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++		push(@Tx,shift(@Tx));
 +}
 +
 +sub Xupdate_avx_32_79()
@@ -5771,34 +5626,33 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
 +  my ($a,$b,$c,$d,$e);
 +
-+	&vpalignr(@X[2], at X[-1&7], at X[-2&7],8);	# compose "X[-6]"
-+	&vpxor	(@X[0], at X[0], at X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
++	&vpalignr(@Tx[0], at X[-1&7], at X[-2&7],8);	# compose "X[-6]"
++	&vpxor	(@X[0], at X[0], at X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
 +	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +
-+	&vpxor	(@X[0], at X[0], at X[-7&7]);	# "X[0]"^="X[-28]"
-+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);	# save X[] to backtrace buffer
-+	 eval(shift(@insns));
++	&vpxor	(@X[0], at X[0], at X[-7&7]);		# "X[0]"^="X[-28]"
 +	 eval(shift(@insns));
-+	 if ($Xi%5) {
-+	  &vmovdqa	(@X[4], at X[3]);	# "perpetuate" K_XX_XX...
-+	 } else {			# ... or load next one
-+	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
-+	 }
-+	  &vpaddd	(@X[3], at X[3], at X[-1&7]);
++	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
++	if ($Xi%5) {
++	  &vmovdqa	(@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
++	} else {			# ... or load next one
++	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
++	}
++	  &vpaddd	(@Tx[1], at Tx[1], at X[-1&7]);
 +	 eval(shift(@insns));		# ror
 +	 eval(shift(@insns));
 +
-+	&vpxor	(@X[0], at X[0], at X[2]);		# "X[0]"^="X[-6]"
++	&vpxor	(@X[0], at X[0], at Tx[0]);		# "X[0]"^="X[-6]"
 +	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +
-+	&vpsrld	(@X[2], at X[0],30);
-+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer to IALU
++	&vpsrld	(@Tx[0], at X[0],30);
++	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# ror
@@ -5814,20 +5668,21 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	 eval(shift(@insns));		# ror
 +	 eval(shift(@insns));
 +
-+	&vpor	(@X[0], at X[0], at X[2]);	# "X[0]"<<<=2
++	&vpor	(@X[0], at X[0], at Tx[0]);		# "X[0]"<<<=2
 +	 eval(shift(@insns));		# body_20_39
 +	 eval(shift(@insns));
-+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
++	  &vmovdqa	(@Tx[1], at X[0])	if ($Xi<19);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	 eval(shift(@insns));		# ror
++	 eval(shift(@insns));		# rol
 +	 eval(shift(@insns));
 +
 +	 foreach (@insns) { eval; }	# remaining instructions
 +
 +  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
++		push(@Tx,shift(@Tx));
 +}
 +
 +sub Xuplast_avx_80()
@@ -5837,30 +5692,29 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +  my ($a,$b,$c,$d,$e);
 +
 +	 eval(shift(@insns));
-+	  &vpaddd	(@X[3], at X[3], at X[-1&7]);
++	  &vpaddd	(@Tx[1], at Tx[1], at X[-1&7]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
-+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]);	# X[]+K xfer IALU
++	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]);	# X[]+K xfer IALU
 +
 +	 foreach (@insns) { eval; }		# remaining instructions
 +
-+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
-+	&cmp	($inp,&DWP(192+8,"esp"));
-+	&je	(&label("done"));
++	&cmp	($inp,$num);
++	&je	(".Ldone_avx");
 +
-+	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
-+	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
-+	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
-+	&vmovdqu(@X[-3&7],&QWP(16,$inp));
-+	&vmovdqu(@X[-2&7],&QWP(32,$inp));
-+	&vmovdqu(@X[-1&7],&QWP(48,$inp));
++	unshift(@Tx,pop(@Tx));
++
++	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
++	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
++	&vmovdqu(@X[-4&7],"0($inp)");		# load input
++	&vmovdqu(@X[-3&7],"16($inp)");
++	&vmovdqu(@X[-2&7],"32($inp)");
++	&vmovdqu(@X[-1&7],"48($inp)");
++	&vpshufb(@X[-4&7], at X[-4&7], at X[2]);	# byte swap
 +	&add	($inp,64);
-+	&vpshufb(@X[-4&7], at X[-4&7], at X[2]);		# byte swap
-+	&mov	(&DWP(192+4,"esp"),$inp);
-+	&vmovdqa(&QWP(112-16,"esp"), at X[3]);	# borrow last backtrace slot
 +
 +  $Xi=0;
 +}
@@ -5873,15 +5727,15 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpshufb	(@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
++	&vpshufb(@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vpaddd	(@X[$Xi&7], at X[($Xi-4)&7], at X[3]);
++	&vpaddd	(@X[$Xi&7], at X[($Xi-4)&7], at Tx[1]);
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
-+	&vmovdqa	(&QWP(0+16*$Xi,"esp"), at X[$Xi&7]);	# X[]+K xfer to IALU
++	&vmovdqa(eval(16*$Xi)."(%rsp)", at X[$Xi&7]);	# X[]+K xfer to IALU
 +	 eval(shift(@insns));
 +	 eval(shift(@insns));
 +
@@ -5898,7 +5752,10 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	foreach (@insns) { eval; }
 +}
 +
-+&set_label("loop",16);
++$code.=<<___;
++.align	16
++.Loop_avx:
++___
 +	&Xupdate_avx_16_31(\&body_00_19);
 +	&Xupdate_avx_16_31(\&body_00_19);
 +	&Xupdate_avx_16_31(\&body_00_19);
@@ -5923,174 +5780,199 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
 +	&Xloop_avx(\&body_20_39);
 +	&Xloop_avx(\&body_20_39);
 +
-+	&mov	(@T[1],&DWP(192,"esp"));	# update context
-+	&add	($A,&DWP(0, at T[1]));
-+	&add	(@T[0],&DWP(4, at T[1]));		# $b
-+	&add	($C,&DWP(8, at T[1]));
-+	&mov	(&DWP(0, at T[1]),$A);
-+	&add	($D,&DWP(12, at T[1]));
-+	&mov	(&DWP(4, at T[1]), at T[0]);
-+	&add	($E,&DWP(16, at T[1]));
-+	&mov	(&DWP(8, at T[1]),$C);
-+	&mov	($B, at T[0]);
-+	&mov	(&DWP(12, at T[1]),$D);
-+	&mov	(&DWP(16, at T[1]),$E);
-+
-+	&jmp	(&label("loop"));
++$code.=<<___;
++	add	0($ctx),$A			# update context
++	add	4($ctx), at T[0]
++	add	8($ctx),$C
++	add	12($ctx),$D
++	mov	$A,0($ctx)
++	add	16($ctx),$E
++	mov	@T[0],4($ctx)
++	mov	@T[0],$B			# magic seed
++	mov	$C,8($ctx)
++	mov	$D,12($ctx)
++	mov	$E,16($ctx)
++	jmp	.Loop_avx
 +
-+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
++.align	16
++.Ldone_avx:
++___
++				$j=$saved_j; @V=@saved_V;
 +
 +	&Xtail_avx(\&body_20_39);
 +	&Xtail_avx(\&body_20_39);
 +	&Xtail_avx(\&body_20_39);
 +
-+	&vzeroall();
++$code.=<<___;
++	vzeroall
 +
-+	&mov	(@T[1],&DWP(192,"esp"));	# update context
-+	&add	($A,&DWP(0, at T[1]));
-+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
-+	&add	(@T[0],&DWP(4, at T[1]));		# $b
-+	&add	($C,&DWP(8, at T[1]));
-+	&mov	(&DWP(0, at T[1]),$A);
-+	&add	($D,&DWP(12, at T[1]));
-+	&mov	(&DWP(4, at T[1]), at T[0]);
-+	&add	($E,&DWP(16, at T[1]));
-+	&mov	(&DWP(8, at T[1]),$C);
-+	&mov	(&DWP(12, at T[1]),$D);
-+	&mov	(&DWP(16, at T[1]),$E);
-+&function_end("_sha1_block_data_order_avx");
-+}
-+&set_label("K_XX_XX",64);
-+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
-+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
-+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
-+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
-+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
++	add	0($ctx),$A			# update context
++	add	4($ctx), at T[0]
++	add	8($ctx),$C
++	mov	$A,0($ctx)
++	add	12($ctx),$D
++	mov	@T[0],4($ctx)
++	add	16($ctx),$E
++	mov	$C,8($ctx)
++	mov	$D,12($ctx)
++	mov	$E,16($ctx)
++___
++$code.=<<___ if ($win64);
++	movaps	64+0(%rsp),%xmm6
++	movaps	64+16(%rsp),%xmm7
++	movaps	64+32(%rsp),%xmm8
++	movaps	64+48(%rsp),%xmm9
++	movaps	64+64(%rsp),%xmm10
++___
++$code.=<<___;
++	lea	`64+($win64?6*16:0)`(%rsp),%rsi
++	mov	0(%rsi),%r12
++	mov	8(%rsi),%rbp
++	mov	16(%rsi),%rbx
++	lea	24(%rsi),%rsp
++.Lepilogue_avx:
++	ret
++.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
++___
 +}
- &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
++$code.=<<___;
++.align	64
++K_XX_XX:
++.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
++.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
++.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
++.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
++.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
++___
++}}}
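One remark on the K_XX_XX table emitted just above: the four constants are the usual SHA-1 round constants, floor(2^30*sqrt(n)) for n = 2, 3, 5 and 10, each replicated into all four 32-bit lanes so that a single paddd/vpaddd adds K to four schedule words at once. A quick stand-alone check (not part of the patch):

    #!/usr/bin/perl
    printf "%08x\n", int(2**30 * sqrt($_)) for (2, 3, 5, 10);
    # prints 5a827999, 6ed9eba1, 8f1bbcdc, ca62c1d6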
++$code.=<<___;
++.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.align	64
+ ___
  
- &asm_finish();
-diff -up openssl-1.0.0d/crypto/x86cpuid.pl.intelopts openssl-1.0.0d/crypto/x86cpuid.pl
---- openssl-1.0.0d/crypto/x86cpuid.pl.intelopts	2010-02-12 18:02:12.000000000 +0100
-+++ openssl-1.0.0d/crypto/x86cpuid.pl	2011-11-03 09:55:42.000000000 +0100
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+@@ -272,25 +1109,73 @@ se_handler:
  
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- push(@INC, "${dir}perlasm", "perlasm");
-@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- 	&pop	("eax");
- 	&xor	("ecx","eax");
- 	&bt	("ecx",21);
--	&jnc	(&label("done"));
-+	&jnc	(&label("generic"));
- 	&xor	("eax","eax");
- 	&cpuid	();
- 	&mov	("edi","eax");		# max value for standard query level
-@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- 	# AMD specific
- 	&mov	("eax",0x80000000);
- 	&cpuid	();
--	&cmp	("eax",0x80000008);
-+	&cmp	("eax",0x80000001);
-+	&jb	(&label("intel"));
-+	&mov	("esi","eax");
-+	&mov	("eax",0x80000001);
-+	&cpuid	();
-+	&or	("ebp","ecx");
-+	&and	("ebp",1<<11|1);	# isolate XOP bit
-+	&cmp	("esi",0x80000008);
- 	&jb	(&label("intel"));
+ 	lea	.Lprologue(%rip),%r10
+ 	cmp	%r10,%rbx		# context->Rip<.Lprologue
+-	jb	.Lin_prologue
++	jb	.Lcommon_seh_tail
  
- 	&mov	("eax",0x80000008);
-@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- 	&mov	("eax",1);
- 	&cpuid	();
- 	&bt	("edx",28);
--	&jnc	(&label("done"));
-+	&jnc	(&label("generic"));
- 	&shr	("ebx",16);
- 	&and	("ebx",0xff);
- 	&cmp	("ebx","esi");
--	&ja	(&label("done"));
-+	&ja	(&label("generic"));
- 	&and	("edx",0xefffffff);	# clear hyper-threading bit
--	&jmp	(&label("done"));
-+	&jmp	(&label("generic"));
- 	
- &set_label("intel");
- 	&cmp	("edi",4);
-@@ -85,27 +92,45 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- &set_label("nocacheinfo");
- 	&mov	("eax",1);
- 	&cpuid	();
-+	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
- 	&cmp	("ebp",0);
--	&jne	(&label("notP4"));
-+	&jne	(&label("notintel"));
-+	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
- 	&and	(&HB("eax"),15);	# familiy ID
- 	&cmp	(&HB("eax"),15);	# P4?
--	&jne	(&label("notP4"));
--	&or	("edx",1<<20);		# use reserved bit to engage RC4_CHAR
--&set_label("notP4");
-+	&jne	(&label("notintel"));
-+	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
-+&set_label("notintel");
- 	&bt	("edx",28);		# test hyper-threading bit
--	&jnc	(&label("done"));
-+	&jnc	(&label("generic"));
- 	&and	("edx",0xefffffff);
- 	&cmp	("edi",0);
--	&je	(&label("done"));
-+	&je	(&label("generic"));
+ 	mov	152($context),%rax	# pull context->Rsp
+ 
+ 	lea	.Lepilogue(%rip),%r10
+ 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+-	jae	.Lin_prologue
++	jae	.Lcommon_seh_tail
  
- 	&or	("edx",0x10000000);
- 	&shr	("ebx",16);
- 	&cmp	(&LB("ebx"),1);
--	&ja	(&label("done"));
-+	&ja	(&label("generic"));
- 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
+ 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
+-	lea	24(%rax),%rax
++	lea	32(%rax),%rax
+ 
+ 	mov	-8(%rax),%rbx
+ 	mov	-16(%rax),%rbp
+ 	mov	-24(%rax),%r12
++	mov	-32(%rax),%r13
+ 	mov	%rbx,144($context)	# restore context->Rbx
+ 	mov	%rbp,160($context)	# restore context->Rbp
+ 	mov	%r12,216($context)	# restore context->R12
++	mov	%r13,224($context)	# restore context->R13
 +
-+&set_label("generic");
-+	&and	("ebp",1<<11);		# isolate AMD XOP flag
-+	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
-+	&mov	("esi","edx");
-+	&or	("ebp","ecx");		# merge AMD XOP flag
++	jmp	.Lcommon_seh_tail
++.size	se_handler,.-se_handler
 +
-+	&bt	("ecx",27);		# check OSXSAVE bit
-+	&jnc	(&label("clear_avx"));
-+	&xor	("ecx","ecx");          # XCR0
-+	&data_byte(0x0f,0x01,0xd0);	# xgetbv
-+	&and	("eax",6);              # isolate XMM and YMM state support
-+	&cmp	("eax",6);
-+	&je	(&label("done"));
-+&set_label("clear_avx");
-+	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
- &set_label("done");
--	&mov	("eax","edx");
--	&mov	("edx","ecx");
-+	&mov	("eax","esi");
-+	&mov	("edx","ebp");
- &function_end("OPENSSL_ia32_cpuid");
++.type	ssse3_handler,\@abi-omnipotent
++.align	16
++ssse3_handler:
++	push	%rsi
++	push	%rdi
++	push	%rbx
++	push	%rbp
++	push	%r12
++	push	%r13
++	push	%r14
++	push	%r15
++	pushfq
++	sub	\$64,%rsp
++
++	mov	120($context),%rax	# pull context->Rax
++	mov	248($context),%rbx	# pull context->Rip
++
++	mov	8($disp),%rsi		# disp->ImageBase
++	mov	56($disp),%r11		# disp->HandlerData
++
++	mov	0(%r11),%r10d		# HandlerData[0]
++	lea	(%rsi,%r10),%r10	# prologue label
++	cmp	%r10,%rbx		# context->Rip<prologue label
++	jb	.Lcommon_seh_tail
++
++	mov	152($context),%rax	# pull context->Rsp
  
- &external_label("OPENSSL_ia32cap_P");
-@@ -199,8 +224,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- 	&bt	(&DWP(0,"ecx"),1);
- 	&jnc	(&label("no_x87"));
- 	if ($sse2) {
--		&bt	(&DWP(0,"ecx"),26);
--		&jnc	(&label("no_sse2"));
-+		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
-+		&cmp	("ecx",1<<26|1<<24);
-+		&jne	(&label("no_sse2"));
- 		&pxor	("xmm0","xmm0");
- 		&pxor	("xmm1","xmm1");
- 		&pxor	("xmm2","xmm2");
-diff -up openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0d/crypto/x86_64cpuid.pl
---- openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts	2010-04-14 21:25:09.000000000 +0200
-+++ openssl-1.0.0d/crypto/x86_64cpuid.pl	2011-08-24 12:50:56.000000000 +0200
+-.Lin_prologue:
++	mov	4(%r11),%r10d		# HandlerData[1]
++	lea	(%rsi,%r10),%r10	# epilogue label
++	cmp	%r10,%rbx		# context->Rip>=epilogue label
++	jae	.Lcommon_seh_tail
++
++	lea	64(%rax),%rsi
++	lea	512($context),%rdi	# &context.Xmm6
++	mov	\$10,%ecx
++	.long	0xa548f3fc		# cld; rep movsq
++	lea	24+5*16(%rax),%rax	# adjust stack pointer
++
++	mov	-8(%rax),%rbx
++	mov	-16(%rax),%rbp
++	mov	%rbx,144($context)	# restore context->Rbx
++	mov	%rbp,160($context)	# restore context->Rbp
++
++.Lcommon_seh_tail:
+ 	mov	8(%rax),%rdi
+ 	mov	16(%rax),%rsi
+ 	mov	%rax,152($context)	# restore context->Rsp
+@@ -328,19 +1213,38 @@ se_handler:
+ 	pop	%rdi
+ 	pop	%rsi
+ 	ret
+-.size	se_handler,.-se_handler
++.size	ssse3_handler,.-ssse3_handler
+ 
+ .section	.pdata
+ .align	4
+ 	.rva	.LSEH_begin_sha1_block_data_order
+ 	.rva	.LSEH_end_sha1_block_data_order
+ 	.rva	.LSEH_info_sha1_block_data_order
+-
++	.rva	.LSEH_begin_sha1_block_data_order_ssse3
++	.rva	.LSEH_end_sha1_block_data_order_ssse3
++	.rva	.LSEH_info_sha1_block_data_order_ssse3
++___
++$code.=<<___ if ($avx);
++	.rva	.LSEH_begin_sha1_block_data_order_avx
++	.rva	.LSEH_end_sha1_block_data_order_avx
++	.rva	.LSEH_info_sha1_block_data_order_avx
++___
++$code.=<<___;
+ .section	.xdata
+ .align	8
+ .LSEH_info_sha1_block_data_order:
+ 	.byte	9,0,0,0
+ 	.rva	se_handler
++.LSEH_info_sha1_block_data_order_ssse3:
++	.byte	9,0,0,0
++	.rva	ssse3_handler
++	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
++___
++$code.=<<___ if ($avx);
++.LSEH_info_sha1_block_data_order_avx:
++	.byte	9,0,0,0
++	.rva	ssse3_handler
++	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
+ ___
+ }
+ 
+diff -up openssl-1.0.0k/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0k/crypto/x86_64cpuid.pl
+--- openssl-1.0.0k/crypto/x86_64cpuid.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/x86_64cpuid.pl	2013-02-19 21:21:59.833360113 +0100
 @@ -1,4 +1,4 @@
 -#!/usr/bin/env perl
 +#!/usr/bin/perl
@@ -6101,12 +5983,12 @@ diff -up openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0d/crypto/x8
  $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
--open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+-open STDOUT,"| \"$^X\" ${dir}perlasm/x86_64-xlate.pl $flavour $output";
 +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 +( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
 +die "can't locate x86_64-xlate.pl";
 +
-+open STDOUT,"| $^X $xlate $flavour $output";
++open STDOUT,"| \"$^X\" $xlate $flavour $output";
 +
 +($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
 +				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
@@ -6219,3 +6101,121 @@ diff -up openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0d/crypto/x8
  	ret
  .size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
  
+diff -up openssl-1.0.0k/crypto/x86cpuid.pl.intelopts openssl-1.0.0k/crypto/x86cpuid.pl
+--- openssl-1.0.0k/crypto/x86cpuid.pl.intelopts	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/x86cpuid.pl	2013-02-19 21:15:39.634408163 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+ 
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC, "${dir}perlasm", "perlasm");
+@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ 	&pop	("eax");
+ 	&xor	("ecx","eax");
+ 	&bt	("ecx",21);
+-	&jnc	(&label("done"));
++	&jnc	(&label("generic"));
+ 	&xor	("eax","eax");
+ 	&cpuid	();
+ 	&mov	("edi","eax");		# max value for standard query level
+@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ 	# AMD specific
+ 	&mov	("eax",0x80000000);
+ 	&cpuid	();
+-	&cmp	("eax",0x80000008);
++	&cmp	("eax",0x80000001);
++	&jb	(&label("intel"));
++	&mov	("esi","eax");
++	&mov	("eax",0x80000001);
++	&cpuid	();
++	&or	("ebp","ecx");
++	&and	("ebp",1<<11|1);	# isolate XOP bit
++	&cmp	("esi",0x80000008);
+ 	&jb	(&label("intel"));
+ 
+ 	&mov	("eax",0x80000008);
+@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ 	&mov	("eax",1);
+ 	&cpuid	();
+ 	&bt	("edx",28);
+-	&jnc	(&label("done"));
++	&jnc	(&label("generic"));
+ 	&shr	("ebx",16);
+ 	&and	("ebx",0xff);
+ 	&cmp	("ebx","esi");
+-	&ja	(&label("done"));
++	&ja	(&label("generic"));
+ 	&and	("edx",0xefffffff);	# clear hyper-threading bit
+-	&jmp	(&label("done"));
++	&jmp	(&label("generic"));
+ 	
+ &set_label("intel");
+ 	&cmp	("edi",4);
+@@ -85,27 +92,45 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ &set_label("nocacheinfo");
+ 	&mov	("eax",1);
+ 	&cpuid	();
++	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
+ 	&cmp	("ebp",0);
+-	&jne	(&label("notP4"));
++	&jne	(&label("notintel"));
++	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
+ 	&and	(&HB("eax"),15);	# familiy ID
+ 	&cmp	(&HB("eax"),15);	# P4?
+-	&jne	(&label("notP4"));
+-	&or	("edx",1<<20);		# use reserved bit to engage RC4_CHAR
+-&set_label("notP4");
++	&jne	(&label("notintel"));
++	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
++&set_label("notintel");
+ 	&bt	("edx",28);		# test hyper-threading bit
+-	&jnc	(&label("done"));
++	&jnc	(&label("generic"));
+ 	&and	("edx",0xefffffff);
+ 	&cmp	("edi",0);
+-	&je	(&label("done"));
++	&je	(&label("generic"));
+ 
+ 	&or	("edx",0x10000000);
+ 	&shr	("ebx",16);
+ 	&cmp	(&LB("ebx"),1);
+-	&ja	(&label("done"));
++	&ja	(&label("generic"));
+ 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
++
++&set_label("generic");
++	&and	("ebp",1<<11);		# isolate AMD XOP flag
++	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
++	&mov	("esi","edx");
++	&or	("ebp","ecx");		# merge AMD XOP flag
++
++	&bt	("ecx",27);		# check OSXSAVE bit
++	&jnc	(&label("clear_avx"));
++	&xor	("ecx","ecx");          # XCR0
++	&data_byte(0x0f,0x01,0xd0);	# xgetbv
++	&and	("eax",6);              # isolate XMM and YMM state support
++	&cmp	("eax",6);
++	&je	(&label("done"));
++&set_label("clear_avx");
++	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
+ &set_label("done");
+-	&mov	("eax","edx");
+-	&mov	("edx","ecx");
++	&mov	("eax","esi");
++	&mov	("edx","ebp");
+ &function_end("OPENSSL_ia32_cpuid");
+ 
+ &external_label("OPENSSL_ia32cap_P");
+@@ -199,8 +224,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ 	&bt	(&DWP(0,"ecx"),1);
+ 	&jnc	(&label("no_x87"));
+ 	if ($sse2) {
+-		&bt	(&DWP(0,"ecx"),26);
+-		&jnc	(&label("no_sse2"));
++		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
++		&cmp	("ecx",1<<26|1<<24);
++		&jne	(&label("no_sse2"));
+ 		&pxor	("xmm0","xmm0");
+ 		&pxor	("xmm1","xmm1");
+ 		&pxor	("xmm2","xmm2");
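
(Illustrative aside, not part of the patch: the AVX gating that the intelopts
patch adds to OPENSSL_ia32_cpuid boils down to "only report AVX/FMA/XOP when
the OS has enabled XMM+YMM state saving", detected via the OSXSAVE bit and an
xgetbv read of XCR0.  A rough C sketch of that check is below; the helper name
is made up, it builds with GCC/Clang on x86, and the real code additionally
merges the AMD XOP flag from extended CPUID.)

    #include <stdint.h>
    #include <cpuid.h>                      /* __get_cpuid, GCC/Clang only */

    /* AVX may only be advertised when CPUID.1:ECX bit 27 (OSXSAVE) is set
     * and XCR0 shows the OS saves both XMM (bit 1) and YMM (bit 2) state. */
    static int os_enables_avx(void)
    {
        unsigned int eax, ebx, ecx, edx;
        uint32_t xcr0_lo, xcr0_hi;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;
        if (!(ecx & (1u << 27)))            /* no OSXSAVE: xgetbv would fault */
            return 0;
        /* xgetbv encoded as raw bytes, the same trick the patch uses */
        __asm__ volatile(".byte 0x0f,0x01,0xd0"
                         : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
        return (xcr0_lo & 6) == 6;          /* XMM and YMM state enabled */
    }
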
diff --git a/openssl-1.0.0k-secure-getenv.patch b/openssl-1.0.0k-secure-getenv.patch
new file mode 100644
index 0000000..3014b46
--- /dev/null
+++ b/openssl-1.0.0k-secure-getenv.patch
@@ -0,0 +1,154 @@
+diff -up openssl-1.0.0k/crypto/conf/conf_api.c.secure-getenv openssl-1.0.0k/crypto/conf/conf_api.c
+--- openssl-1.0.0k/crypto/conf/conf_api.c.secure-getenv	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/conf/conf_api.c	2013-02-19 21:25:56.623198152 +0100
+@@ -142,7 +142,7 @@ char *_CONF_get_string(const CONF *conf,
+ 			if (v != NULL) return(v->value);
+ 			if (strcmp(section,"ENV") == 0)
+ 				{
+-				p=getenv(name);
++				p=__secure_getenv(name);
+ 				if (p != NULL) return(p);
+ 				}
+ 			}
+@@ -155,7 +155,7 @@ char *_CONF_get_string(const CONF *conf,
+ 			return(NULL);
+ 		}
+ 	else
+-		return(getenv(name));
++		return (__secure_getenv(name));
+ 	}
+ 
+ #if 0 /* There's no way to provide error checking with this function, so
+diff -up openssl-1.0.0k/crypto/conf/conf_mod.c.secure-getenv openssl-1.0.0k/crypto/conf/conf_mod.c
+--- openssl-1.0.0k/crypto/conf/conf_mod.c.secure-getenv	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/conf/conf_mod.c	2013-02-19 21:25:56.624198172 +0100
+@@ -548,8 +548,8 @@ char *CONF_get1_default_config_file(void
+ 	char *file;
+ 	int len;
+ 
+-	file = getenv("OPENSSL_CONF");
+-	if (file) 
++	file = __secure_getenv("OPENSSL_CONF");
++	if (file)
+ 		return BUF_strdup(file);
+ 
+ 	len = strlen(X509_get_default_cert_area());
+diff -up openssl-1.0.0k/crypto/engine/eng_list.c.secure-getenv openssl-1.0.0k/crypto/engine/eng_list.c
+--- openssl-1.0.0k/crypto/engine/eng_list.c.secure-getenv	2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/engine/eng_list.c	2013-02-19 21:25:56.625198193 +0100
+@@ -399,9 +399,9 @@ ENGINE *ENGINE_by_id(const char *id)
+ 	if (strcmp(id, "dynamic"))
+ 		{
+ #ifdef OPENSSL_SYS_VMS
+-		if((load_dir = getenv("OPENSSL_ENGINES")) == 0) load_dir = "SSLROOT:[ENGINES]";
++		if(OPENSSL_issetugid() || (load_dir = getenv("OPENSSL_ENGINES")) == 0) load_dir = "SSLROOT:[ENGINES]";
+ #else
+-		if((load_dir = getenv("OPENSSL_ENGINES")) == 0) load_dir = ENGINESDIR;
++		if((load_dir = __secure_getenv("OPENSSL_ENGINES")) == 0) load_dir = ENGINESDIR;
+ #endif
+ 		iterator = ENGINE_by_id("dynamic");
+ 		if(!iterator || !ENGINE_ctrl_cmd_string(iterator, "ID", id, 0) ||
+diff -up openssl-1.0.0k/crypto/md5/md5_dgst.c.secure-getenv openssl-1.0.0k/crypto/md5/md5_dgst.c
+--- openssl-1.0.0k/crypto/md5/md5_dgst.c.secure-getenv	2013-02-19 21:25:56.000000000 +0100
++++ openssl-1.0.0k/crypto/md5/md5_dgst.c	2013-02-19 21:27:02.814550574 +0100
+@@ -78,7 +78,7 @@ const char MD5_version[]="MD5" OPENSSL_V
+ int MD5_Init(MD5_CTX *c)
+ #ifdef OPENSSL_FIPS
+ 	{
+-	if (FIPS_mode() && getenv("OPENSSL_FIPS_NON_APPROVED_MD5_ALLOW") == NULL)
++	if (FIPS_mode() && __secure_getenv("OPENSSL_FIPS_NON_APPROVED_MD5_ALLOW") == NULL)
+ 		FIPS_BAD_ALGORITHM(alg)
+ 	return private_MD5_Init(c);
+ 	}
+diff -up openssl-1.0.0k/crypto/o_init.c.secure-getenv openssl-1.0.0k/crypto/o_init.c
+--- openssl-1.0.0k/crypto/o_init.c.secure-getenv	2013-02-19 21:25:56.491195456 +0100
++++ openssl-1.0.0k/crypto/o_init.c	2013-02-19 21:25:56.628198256 +0100
+@@ -75,7 +75,7 @@ static void init_fips_mode(void)
+ 	char buf[2] = "0";
+ 	int fd;
+ 	
+-	if (getenv("OPENSSL_FORCE_FIPS_MODE") != NULL)
++	if (__secure_getenv("OPENSSL_FORCE_FIPS_MODE") != NULL)
+ 		{
+ 		buf[0] = '1';
+ 		}
+diff -up openssl-1.0.0k/crypto/rand/randfile.c.secure-getenv openssl-1.0.0k/crypto/rand/randfile.c
+--- openssl-1.0.0k/crypto/rand/randfile.c.secure-getenv	2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/randfile.c	2013-02-19 21:25:56.630198296 +0100
+@@ -275,8 +275,7 @@ const char *RAND_file_name(char *buf, si
+ 	struct stat sb;
+ #endif
+ 
+-	if (OPENSSL_issetugid() == 0)
+-		s=getenv("RANDFILE");
++	s=__secure_getenv("RANDFILE");
+ 	if (s != NULL && *s && strlen(s) + 1 < size)
+ 		{
+ 		if (BUF_strlcpy(buf,s,size) >= size)
+@@ -284,8 +283,7 @@ const char *RAND_file_name(char *buf, si
+ 		}
+ 	else
+ 		{
+-		if (OPENSSL_issetugid() == 0)
+-			s=getenv("HOME");
++		s=__secure_getenv("HOME");
+ #ifdef DEFAULT_HOME
+ 		if (s == NULL)
+ 			{
+diff -up openssl-1.0.0k/crypto/x509/by_dir.c.secure-getenv openssl-1.0.0k/crypto/x509/by_dir.c
+--- openssl-1.0.0k/crypto/x509/by_dir.c.secure-getenv	2013-02-05 12:47:29.000000000 +0100
++++ openssl-1.0.0k/crypto/x509/by_dir.c	2013-02-19 21:25:56.638198460 +0100
+@@ -135,7 +135,7 @@ static int dir_ctrl(X509_LOOKUP *ctx, in
+ 	case X509_L_ADD_DIR:
+ 		if (argl == X509_FILETYPE_DEFAULT)
+ 			{
+-			dir=(char *)getenv(X509_get_default_cert_dir_env());
++			dir=(char *)__secure_getenv(X509_get_default_cert_dir_env());
+ 			if (dir)
+ 				ret=add_cert_dir(ld,dir,X509_FILETYPE_PEM);
+ 			else
+diff -up openssl-1.0.0k/crypto/x509/by_file.c.secure-getenv openssl-1.0.0k/crypto/x509/by_file.c
+--- openssl-1.0.0k/crypto/x509/by_file.c.secure-getenv	2013-02-19 21:25:56.431194229 +0100
++++ openssl-1.0.0k/crypto/x509/by_file.c	2013-02-19 21:25:56.639198480 +0100
+@@ -100,7 +100,7 @@ static int by_file_ctrl(X509_LOOKUP *ctx
+ 	case X509_L_FILE_LOAD:
+ 		if (argl == X509_FILETYPE_DEFAULT)
+ 			{
+-			file = (char *)getenv(X509_get_default_cert_file_env());
++			file = (char *)__secure_getenv(X509_get_default_cert_file_env());
+ 			if (file)
+ 				ok = (X509_load_cert_crl_file(ctx,file,
+ 					      X509_FILETYPE_PEM) != 0);
+diff -up openssl-1.0.0k/crypto/x509/x509_vfy.c.secure-getenv openssl-1.0.0k/crypto/x509/x509_vfy.c
+--- openssl-1.0.0k/crypto/x509/x509_vfy.c.secure-getenv	2013-02-05 12:47:29.000000000 +0100
++++ openssl-1.0.0k/crypto/x509/x509_vfy.c	2013-02-19 21:25:56.642198540 +0100
+@@ -481,7 +481,7 @@ static int check_chain_extensions(X509_S
+ 			!!(ctx->param->flags & X509_V_FLAG_ALLOW_PROXY_CERTS);
+ 		/* A hack to keep people who don't want to modify their
+ 		   software happy */
+-		if (getenv("OPENSSL_ALLOW_PROXY_CERTS"))
++		if (__secure_getenv("OPENSSL_ALLOW_PROXY_CERTS"))
+ 			allow_proxy_certs = 1;
+ 		purpose = ctx->param->purpose;
+ 		}
+diff -up openssl-1.0.0k/engines/ccgost/gost_ctl.c.secure-getenv openssl-1.0.0k/engines/ccgost/gost_ctl.c
+--- openssl-1.0.0k/engines/ccgost/gost_ctl.c.secure-getenv	2013-02-05 12:47:29.000000000 +0100
++++ openssl-1.0.0k/engines/ccgost/gost_ctl.c	2013-02-19 21:25:56.643198560 +0100
+@@ -65,7 +65,7 @@ const char *get_gost_engine_param(int pa
+ 		{
+ 		return gost_params[param];
+ 		}
+-	tmp = getenv(gost_envnames[param]);
++	tmp = __secure_getenv(gost_envnames[param]);
+ 	if (tmp) 
+ 		{
+ 		if (gost_params[param]) OPENSSL_free(gost_params[param]);
+@@ -79,7 +79,7 @@ int gost_set_default_param(int param, co
+ 	{
+ 	const char *tmp;
+ 	if (param <0 || param >GOST_PARAM_MAX) return 0;
+-	tmp = getenv(gost_envnames[param]);
++	tmp = __secure_getenv(gost_envnames[param]);
+ 	/* if there is value in the environment, use it, else -passed string * */
+ 	if (!tmp) tmp=value;
+ 	if (gost_params[param]) OPENSSL_free(gost_params[param]);
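
(Illustrative aside, not part of the patch: the secure-getenv patch replaces
getenv() with glibc's __secure_getenv(), which ignores the environment in
privileged set-user-ID/set-group-ID processes, so variables such as
OPENSSL_CONF, RANDFILE or OPENSSL_ENGINES cannot point a privileged binary at
attacker-controlled files.  A rough approximation of that behaviour is shown
below; the helper name is hypothetical, and real glibc keys off AT_SECURE
rather than comparing IDs directly.)

    #include <stdlib.h>
    #include <unistd.h>

    /* Treat the variable as unset whenever the real and effective user or
     * group IDs differ, i.e. in setuid/setgid binaries. */
    static char *demo_secure_getenv(const char *name)
    {
        if (getuid() != geteuid() || getgid() != getegid())
            return NULL;
        return getenv(name);
    }
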
diff --git a/openssl-1.0.0k-version.patch b/openssl-1.0.0k-version.patch
new file mode 100644
index 0000000..f08fab4
--- /dev/null
+++ b/openssl-1.0.0k-version.patch
@@ -0,0 +1,21 @@
+diff -up openssl-1.0.0k/crypto/opensslv.h.version openssl-1.0.0k/crypto/opensslv.h
+--- openssl-1.0.0k/crypto/opensslv.h.version	2013-02-19 21:12:26.903472656 +0100
++++ openssl-1.0.0k/crypto/opensslv.h	2013-02-19 21:14:35.613100870 +0100
+@@ -25,7 +25,7 @@
+  * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
+  *  major minor fix final patch/beta)
+  */
+-#define OPENSSL_VERSION_NUMBER	0x100000bfL
++#define OPENSSL_VERSION_NUMBER	0x10000003L
+ #ifdef OPENSSL_FIPS
+ #define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0k-fips 5 Feb 2013"
+ #else
+@@ -83,7 +83,7 @@
+  * should only keep the versions that are binary compatible with the current.
+  */
+ #define SHLIB_VERSION_HISTORY ""
+-#define SHLIB_VERSION_NUMBER "1.0.0"
++#define SHLIB_VERSION_NUMBER "1.0.0k"
+ 
+ 
+ #endif /* HEADER_OPENSSLV_H */
diff --git a/openssl-1.0.1e-env-zlib.patch b/openssl-1.0.1e-env-zlib.patch
new file mode 100644
index 0000000..297d3a3
--- /dev/null
+++ b/openssl-1.0.1e-env-zlib.patch
@@ -0,0 +1,29 @@
+diff -up openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod.env-zlib openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod
+--- openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod.env-zlib	2013-02-11 16:02:48.000000000 +0100
++++ openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod	2013-02-19 16:32:51.000000000 +0100
+@@ -47,6 +47,13 @@ Once the identities of the compression m
+ been standardized, the compression API will most likely be changed. Using
+ it in the current state is not recommended.
+ 
++It is also not recommended to use compression if data transfered contain
++untrusted parts that can be manipulated by an attacker as he could then
++get information about the encrypted data. See the CRIME attack. For
++that reason the default loading of the zlib compression method is
++disabled and enabled only if the environment variable B<OPENSSL_DEFAULT_ZLIB>
++is present during the library initialization.
++
+ =head1 RETURN VALUES
+ 
+ SSL_COMP_add_compression_method() may return the following values:
+diff -up openssl-1.0.1e/ssl/ssl_ciph.c.env-zlib openssl-1.0.1e/ssl/ssl_ciph.c
+--- openssl-1.0.1e/ssl/ssl_ciph.c.env-zlib	2013-02-11 16:26:04.000000000 +0100
++++ openssl-1.0.1e/ssl/ssl_ciph.c	2013-02-19 16:37:36.163545085 +0100
+@@ -455,7 +455,7 @@ static void load_builtin_compressions(vo
+ 
+ 			MemCheck_off();
+ 			ssl_comp_methods=sk_SSL_COMP_new(sk_comp_cmp);
+-			if (ssl_comp_methods != NULL)
++			if (ssl_comp_methods != NULL && __secure_getenv("OPENSSL_DEFAULT_ZLIB") != NULL)
+ 				{
+ 				comp=(SSL_COMP *)OPENSSL_malloc(sizeof(SSL_COMP));
+ 				if (comp != NULL)
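
(Illustrative aside, not part of the patch: with the env-zlib change the zlib
compression method stays compiled in but is no longer registered by default,
which blunts CRIME-style attacks; it is loaded only if OPENSSL_DEFAULT_ZLIB is
present in the environment when the library sets up its compression methods.
A minimal sketch of how a process could opt back in follows; the variable only
needs to exist, its value is not inspected, and because the check also goes
through __secure_getenv it is still ignored in setuid programs.)

    #include <stdlib.h>
    #include <openssl/ssl.h>

    int main(void)
    {
        /* Must be set before the SSL library registers its compression
         * methods, i.e. before any SSL_CTX or handshake activity. */
        setenv("OPENSSL_DEFAULT_ZLIB", "1", 1);
        SSL_library_init();
        SSL_load_error_strings();
        /* ... create SSL_CTX and negotiate as usual ... */
        return 0;
    }
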
diff --git a/openssl.spec b/openssl.spec
index c03cc34..14d35a9 100644
--- a/openssl.spec
+++ b/openssl.spec
@@ -20,8 +20,8 @@
 
 Summary: A general purpose cryptography library with TLS implementation
 Name: openssl
-Version: 1.0.0j
-Release: 2%{?dist}
+Version: 1.0.0k
+Release: 1%{?dist}
 Epoch: 1
 # We have to remove certain patented algorithms from the openssl source
 # tarball with the hobble-openssl script which is included below.
@@ -52,15 +52,15 @@ Patch34: openssl-0.9.6-x509.patch
 Patch35: openssl-0.9.8j-version-add-engines.patch
 Patch38: openssl-1.0.0-beta5-cipher-change.patch
 Patch39: openssl-1.0.0b-ipv6-apps.patch
-Patch40: openssl-1.0.0f-fips.patch
+Patch40: openssl-1.0.0k-fips.patch
 Patch41: openssl-1.0.0-beta3-fipscheck.patch
 Patch43: openssl-1.0.0a-fipsmode.patch
 Patch44: openssl-1.0.0-beta3-fipsrng.patch
-Patch45: openssl-0.9.8j-env-nozlib.patch
+Patch45: openssl-1.0.1e-env-zlib.patch
 Patch47: openssl-1.0.0-beta5-readme-warning.patch
 Patch49: openssl-1.0.1a-algo-doc.patch
 Patch50: openssl-1.0.0-beta4-dtls1-abi.patch
-Patch51: openssl-1.0.0j-version.patch
+Patch51: openssl-1.0.0k-version.patch
 Patch52: openssl-1.0.0b-aesni.patch
 Patch53: openssl-1.0.0-name-hash.patch
 Patch54: openssl-1.0.0c-speed-fips.patch
@@ -73,11 +73,13 @@ Patch60: openssl-1.0.0d-apps-dgst.patch
 Patch61: openssl-1.0.0d-cavs.patch
 Patch62: openssl-1.0.0-fips-aesni.patch
 Patch63: openssl-1.0.0d-xmpp-starttls.patch
-Patch64: openssl-1.0.0d-intelopts.patch
+Patch64: openssl-1.0.0k-intelopts.patch
 Patch65: openssl-1.0.0e-chil-fixes.patch
 Patch66: openssl-1.0.0-sha2test.patch
+Patch67: openssl-1.0.0k-secure-getenv.patch
 # Backported fixes including security fixes
 Patch81: openssl-1.0.0d-padlock64.patch
+Patch82: openssl-1.0.0k-backports.patch
 
 License: OpenSSL
 Group: System Environment/Libraries
@@ -153,7 +155,7 @@ from other formats to the formats used by the OpenSSL toolkit.
 %patch41 -p1 -b .fipscheck
 %patch43 -p1 -b .fipsmode
 %patch44 -p1 -b .fipsrng
-%patch45 -p1 -b .env-nozlib
+%patch45 -p1 -b .env-zlib
 %patch47 -p1 -b .warning
 %patch49 -p1 -b .algo-doc
 %patch50 -p1 -b .dtls1-abi
@@ -173,8 +175,10 @@ from other formats to the formats used by the OpenSSL toolkit.
 %patch64 -p1 -b .intelopts
 %patch65 -p1 -b .chil
 %patch66 -p1 -b .sha2test
+%patch67 -p1 -b .secure-getenv
 
 %patch81 -p1 -b .padlock64
+%patch82 -p1 -b .backports
 
 # Modify the various perl scripts to reference perl in the right location.
 perl util/perlpath.pl `dirname %{__perl}`
@@ -424,6 +428,9 @@ rm -rf $RPM_BUILD_ROOT/%{_libdir}/fipscanister.*
 %postun -p /sbin/ldconfig
 
 %changelog
+* Tue Feb 19 2013 Tomas Mraz <tmraz at redhat.com> 1.0.0k-1
+- new upstream release fixing multiple CVEs
+
 * Thu Jul 12 2012 Tomas Mraz <tmraz at redhat.com> 1.0.0j-2
 - fix s_server with new glibc when no global IPv6 address (#839031)
 
diff --git a/sources b/sources
index 4a4e3bd..79c0f0b 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-f6eff5c8ba4db07d702163ba2f37757c  openssl-1.0.0j-usa.tar.xz
+a8109e845ff32b19fd928f7dfbcebf66  openssl-1.0.0k-usa.tar.xz

