[openssl/f17] new upstream release fixing multiple CVEs
Tomáš Mráz
tmraz at fedoraproject.org
Tue Feb 19 20:57:11 UTC 2013
commit 36472b541d49ddaa5488d37aaf4cba1b45180e25
Author: Tomas Mraz <tmraz at fedoraproject.org>
Date: Tue Feb 19 21:57:05 2013 +0100
new upstream release fixing multiple CVEs
.gitignore | 1 +
openssl-0.9.8j-env-nozlib.patch | 13 -
openssl-1.0.0j-version.patch | 21 -
openssl-1.0.0k-backports.patch | 775 ++
...-1.0.0f-fips.patch => openssl-1.0.0k-fips.patch | 1233 ++--
...telopts.patch => openssl-1.0.0k-intelopts.patch | 9656 ++++++++++----------
openssl-1.0.0k-secure-getenv.patch | 154 +
openssl-1.0.0k-version.patch | 21 +
openssl-1.0.1e-env-zlib.patch | 29 +
openssl.spec | 21 +-
sources | 2 +-
11 files changed, 6438 insertions(+), 5488 deletions(-)
---
diff --git a/.gitignore b/.gitignore
index 47c7de5..59ed437 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ openssl-1.0.0a-usa.tar.bz2
/openssl-1.0.0h-usa.tar.xz
/openssl-1.0.0i-usa.tar.xz
/openssl-1.0.0j-usa.tar.xz
+/openssl-1.0.0k-usa.tar.xz
diff --git a/openssl-1.0.0k-backports.patch b/openssl-1.0.0k-backports.patch
new file mode 100644
index 0000000..05661e9
--- /dev/null
+++ b/openssl-1.0.0k-backports.patch
@@ -0,0 +1,775 @@
+diff --git a/doc/crypto/X509_STORE_CTX_get_error.pod b/doc/crypto/X509_STORE_CTX_get_error.pod
+index a883f6c..60e8332 100644
+--- a/doc/crypto/X509_STORE_CTX_get_error.pod
++++ b/doc/crypto/X509_STORE_CTX_get_error.pod
+@@ -278,6 +278,8 @@ happen if extended CRL checking is enabled.
+ an application specific error. This will never be returned unless explicitly
+ set by an application.
+
++=back
++
+ =head1 NOTES
+
+ The above functions should be used instead of directly referencing the fields
+diff --git a/doc/ssl/SSL_CTX_set_client_CA_list.pod b/doc/ssl/SSL_CTX_set_client_CA_list.pod
+index 632b556..5e66133 100644
+--- a/doc/ssl/SSL_CTX_set_client_CA_list.pod
++++ b/doc/ssl/SSL_CTX_set_client_CA_list.pod
+@@ -66,16 +66,16 @@ values:
+
+ =over 4
+
+-=item 1
+-
+-The operation succeeded.
+-
+ =item 0
+
+ A failure while manipulating the STACK_OF(X509_NAME) object occurred or
+ the X509_NAME could not be extracted from B<cacert>. Check the error stack
+ to find out the reason.
+
++=item 1
++
++The operation succeeded.
++
+ =back
+
+ =head1 EXAMPLES
+diff --git a/doc/ssl/SSL_CTX_use_psk_identity_hint.pod b/doc/ssl/SSL_CTX_use_psk_identity_hint.pod
+index b80e25b..7e60df5 100644
+--- a/doc/ssl/SSL_CTX_use_psk_identity_hint.pod
++++ b/doc/ssl/SSL_CTX_use_psk_identity_hint.pod
+@@ -81,6 +81,8 @@ SSL_CTX_use_psk_identity_hint() and SSL_use_psk_identity_hint() return
+
+ Return values from the server callback are interpreted as follows:
+
++=over 4
++
+ =item > 0
+
+ PSK identity was found and the server callback has provided the PSK
+@@ -99,4 +101,6 @@ completely.
+ PSK identity was not found. An "unknown_psk_identity" alert message
+ will be sent and the connection setup fails.
+
++=back
++
+ =cut
+diff --git a/doc/ssl/SSL_accept.pod b/doc/ssl/SSL_accept.pod
+index cc724c0..b1c34d1 100644
+--- a/doc/ssl/SSL_accept.pod
++++ b/doc/ssl/SSL_accept.pod
+@@ -44,17 +44,17 @@ The following return values can occur:
+
+ =over 4
+
+-=item 1
+-
+-The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
+-established.
+-
+ =item 0
+
+ The TLS/SSL handshake was not successful but was shut down controlled and
+ by the specifications of the TLS/SSL protocol. Call SSL_get_error() with the
+ return value B<ret> to find out the reason.
+
++=item 1
++
++The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
++established.
++
+ =item E<lt>0
+
+ The TLS/SSL handshake was not successful because a fatal error occurred either
+diff --git a/doc/ssl/SSL_connect.pod b/doc/ssl/SSL_connect.pod
+index cc56ebb..946ca89 100644
+--- a/doc/ssl/SSL_connect.pod
++++ b/doc/ssl/SSL_connect.pod
+@@ -41,17 +41,17 @@ The following return values can occur:
+
+ =over 4
+
+-=item 1
+-
+-The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
+-established.
+-
+ =item 0
+
+ The TLS/SSL handshake was not successful but was shut down controlled and
+ by the specifications of the TLS/SSL protocol. Call SSL_get_error() with the
+ return value B<ret> to find out the reason.
+
++=item 1
++
++The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
++established.
++
+ =item E<lt>0
+
+ The TLS/SSL handshake was not successful, because a fatal error occurred either
+diff --git a/doc/ssl/SSL_do_handshake.pod b/doc/ssl/SSL_do_handshake.pod
+index 2435764..7f8cf24 100644
+--- a/doc/ssl/SSL_do_handshake.pod
++++ b/doc/ssl/SSL_do_handshake.pod
+@@ -45,17 +45,17 @@ The following return values can occur:
+
+ =over 4
+
+-=item 1
+-
+-The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
+-established.
+-
+ =item 0
+
+ The TLS/SSL handshake was not successful but was shut down controlled and
+ by the specifications of the TLS/SSL protocol. Call SSL_get_error() with the
+ return value B<ret> to find out the reason.
+
++=item 1
++
++The TLS/SSL handshake was successfully completed, a TLS/SSL connection has been
++established.
++
+ =item E<lt>0
+
+ The TLS/SSL handshake was not successful because a fatal error occurred either
+diff --git a/doc/ssl/SSL_shutdown.pod b/doc/ssl/SSL_shutdown.pod
+index 89911ac..42a89b7 100644
+--- a/doc/ssl/SSL_shutdown.pod
++++ b/doc/ssl/SSL_shutdown.pod
+@@ -92,11 +92,6 @@ The following return values can occur:
+
+ =over 4
+
+-=item 1
+-
+-The shutdown was successfully completed. The "close notify" alert was sent
+-and the peer's "close notify" alert was received.
+-
+ =item 0
+
+ The shutdown is not yet finished. Call SSL_shutdown() for a second time,
+@@ -104,6 +99,11 @@ if a bidirectional shutdown shall be performed.
+ The output of L<SSL_get_error(3)|SSL_get_error(3)> may be misleading, as an
+ erroneous SSL_ERROR_SYSCALL may be flagged even though no error occurred.
+
++=item 1
++
++The shutdown was successfully completed. The "close notify" alert was sent
++and the peer's "close notify" alert was received.
++
+ =item -1
+
+ The shutdown was not successful because a fatal error occurred either
+diff --git a/ssl/d1_pkt.c b/ssl/d1_pkt.c
+index 3c81786..9b013e4 100644
+--- a/ssl/d1_pkt.c
++++ b/ssl/d1_pkt.c
+@@ -371,7 +371,7 @@ dtls1_process_record(SSL *s)
+ int enc_err;
+ SSL_SESSION *sess;
+ SSL3_RECORD *rr;
+- unsigned int mac_size;
++ unsigned int mac_size, orig_len;
+ unsigned char md[EVP_MAX_MD_SIZE];
+
+ rr= &(s->s3->rrec);
+@@ -402,7 +402,6 @@ dtls1_process_record(SSL *s)
+
+ /* decrypt in place in 'rr->input' */
+ rr->data=rr->input;
+- rr->orig_len=rr->length;
+
+ enc_err = s->method->ssl3_enc->enc(s,0);
+ /* enc_err is:
+@@ -434,15 +433,18 @@ printf("\n");
+ mac_size=EVP_MD_CTX_size(s->read_hash);
+ OPENSSL_assert(mac_size <= EVP_MAX_MD_SIZE);
+
++ /* kludge: *_cbc_remove_padding passes padding length in rr->type */
++ orig_len = rr->length+((unsigned int)rr->type>>8);
++
+ /* orig_len is the length of the record before any padding was
+ * removed. This is public information, as is the MAC in use,
+ * therefore we can safely process the record in a different
+ * amount of time if it's too short to possibly contain a MAC.
+ */
+- if (rr->orig_len < mac_size ||
++ if (orig_len < mac_size ||
+ /* CBC records must have a padding length byte too. */
+ (EVP_CIPHER_CTX_mode(s->enc_read_ctx) == EVP_CIPH_CBC_MODE &&
+- rr->orig_len < mac_size+1))
++ orig_len < mac_size+1))
+ {
+ al=SSL_AD_DECODE_ERROR;
+ SSLerr(SSL_F_DTLS1_PROCESS_RECORD,SSL_R_LENGTH_TOO_SHORT);
+@@ -457,12 +459,12 @@ printf("\n");
+ * without leaking the contents of the padding bytes.
+ * */
+ mac = mac_tmp;
+- ssl3_cbc_copy_mac(mac_tmp, rr, mac_size);
++ ssl3_cbc_copy_mac(mac_tmp, rr, mac_size, orig_len);
+ rr->length -= mac_size;
+ }
+ else
+ {
+- /* In this case there's no padding, so |rec->orig_len|
++ /* In this case there's no padding, so |orig_len|
+ * equals |rec->length| and we checked that there's
+ * enough bytes for |mac_size| above. */
+ rr->length -= mac_size;
+diff --git a/ssl/s3_cbc.c b/ssl/s3_cbc.c
+index dc3fd3e..61413b8 100644
+--- a/ssl/s3_cbc.c
++++ b/ssl/s3_cbc.c
+@@ -76,6 +76,13 @@
+ #define DUPLICATE_MSB_TO_ALL(x) ( (unsigned)( (int)(x) >> (sizeof(int)*8-1) ) )
+ #define DUPLICATE_MSB_TO_ALL_8(x) ((unsigned char)(DUPLICATE_MSB_TO_ALL(x)))
+
+/* constant_time_lt returns an all-ones mask if a<b and 0 otherwise (a and b must each fit in a non-negative int). */
+static unsigned constant_time_lt(unsigned a, unsigned b)
+ {
+ a -= b; /* the top bit of the wrapped difference is set exactly when a<b */
+ return DUPLICATE_MSB_TO_ALL(a); /* smear that top bit across the whole word */
+ }
++
+ /* constant_time_ge returns 0xff if a>=b and 0x00 otherwise. */
+ static unsigned constant_time_ge(unsigned a, unsigned b)
+ {
+@@ -84,7 +91,7 @@ static unsigned constant_time_ge(unsigned a, unsigned b)
+ }
+
+ /* constant_time_eq_8 returns 0xff if a==b and 0x00 otherwise. */
+-static unsigned char constant_time_eq_8(unsigned char a, unsigned char b)
++static unsigned char constant_time_eq_8(unsigned a, unsigned b)
+ {
+ unsigned c = a ^ b;
+ c--;
+@@ -116,7 +123,9 @@ int ssl3_cbc_remove_padding(const SSL* s,
+ good = constant_time_ge(rec->length, padding_length+overhead);
+ /* SSLv3 requires that the padding is minimal. */
+ good &= constant_time_ge(block_size, padding_length+1);
+- rec->length -= good & (padding_length+1);
++ padding_length = good & (padding_length+1);
++ rec->length -= padding_length;
++ rec->type |= padding_length<<8; /* kludge: pass padding length */
+ return (int)((good & 1) | (~good & -1));
+ }
+
+@@ -137,14 +146,21 @@ int tls1_cbc_remove_padding(const SSL* s,
+ unsigned mac_size)
+ {
+ unsigned padding_length, good, to_check, i;
+- const char has_explicit_iv = s->version == DTLS1_VERSION;
+- const unsigned overhead = 1 /* padding length byte */ +
+- mac_size +
+- (has_explicit_iv ? block_size : 0);
+-
+- /* These lengths are all public so we can test them in non-constant
+- * time. */
+- if (overhead > rec->length)
++ const unsigned overhead = 1 /* padding length byte */ + mac_size;
++ /* Check if version requires explicit IV */
++ if (s->version == DTLS1_VERSION || s->version == DTLS1_BAD_VER)
++ {
++ /* These lengths are all public so we can test them in
++ * non-constant time.
++ */
++ if (overhead + block_size > rec->length)
++ return 0;
++ /* We can now safely skip explicit IV */
++ rec->data += block_size;
++ rec->input += block_size;
++ rec->length -= block_size;
++ }
++ else if (overhead > rec->length)
+ return 0;
+
+ padding_length = rec->data[rec->length-1];
+@@ -202,31 +218,13 @@ int tls1_cbc_remove_padding(const SSL* s,
+ good <<= sizeof(good)*8-1;
+ good = DUPLICATE_MSB_TO_ALL(good);
+
+- rec->length -= good & (padding_length+1);
+-
+- /* We can always safely skip the explicit IV. We check at the beginning
+- * of this function that the record has at least enough space for the
+- * IV, MAC and padding length byte. (These can be checked in
+- * non-constant time because it's all public information.) So, if the
+- * padding was invalid, then we didn't change |rec->length| and this is
+- * safe. If the padding was valid then we know that we have at least
+- * overhead+padding_length bytes of space and so this is still safe
+- * because overhead accounts for the explicit IV. */
+- if (has_explicit_iv)
+- {
+- rec->data += block_size;
+- rec->input += block_size;
+- rec->length -= block_size;
+- rec->orig_len -= block_size;
+- }
++ padding_length = good & (padding_length+1);
++ rec->length -= padding_length;
++ rec->type |= padding_length<<8; /* kludge: pass padding length */
+
+ return (int)((good & 1) | (~good & -1));
+ }
+
+-#if defined(_M_AMD64) || defined(__x86_64__)
+-#define CBC_MAC_ROTATE_IN_PLACE
+-#endif
+-
+ /* ssl3_cbc_copy_mac copies |md_size| bytes from the end of |rec| to |out| in
+ * constant time (independent of the concrete value of rec->length, which may
+ * vary within a 256-byte window).
+@@ -240,15 +238,18 @@ int tls1_cbc_remove_padding(const SSL* s,
+ *
+ * If CBC_MAC_ROTATE_IN_PLACE is defined then the rotation is performed with
+ * variable accesses in a 64-byte-aligned buffer. Assuming that this fits into
+- * a single cache-line, then the variable memory accesses don't actually affect
+- * the timing. This has been tested to be true on Intel amd64 chips.
++ * a single or pair of cache-lines, then the variable memory accesses don't
++ * actually affect the timing. CPUs with smaller cache-lines [if any] are
++ * not multi-core and are not considered vulnerable to cache-timing attacks.
+ */
++#define CBC_MAC_ROTATE_IN_PLACE
++
+ void ssl3_cbc_copy_mac(unsigned char* out,
+ const SSL3_RECORD *rec,
+- unsigned md_size)
++ unsigned md_size,unsigned orig_len)
+ {
+ #if defined(CBC_MAC_ROTATE_IN_PLACE)
+- unsigned char rotated_mac_buf[EVP_MAX_MD_SIZE*2];
++ unsigned char rotated_mac_buf[64+EVP_MAX_MD_SIZE];
+ unsigned char *rotated_mac;
+ #else
+ unsigned char rotated_mac[EVP_MAX_MD_SIZE];
+@@ -264,16 +265,16 @@ void ssl3_cbc_copy_mac(unsigned char* out,
+ unsigned div_spoiler;
+ unsigned rotate_offset;
+
+- OPENSSL_assert(rec->orig_len >= md_size);
++ OPENSSL_assert(orig_len >= md_size);
+ OPENSSL_assert(md_size <= EVP_MAX_MD_SIZE);
+
+ #if defined(CBC_MAC_ROTATE_IN_PLACE)
+- rotated_mac = (unsigned char*) (((intptr_t)(rotated_mac_buf + 64)) & ~63);
++ rotated_mac = rotated_mac_buf + ((0-(size_t)rotated_mac_buf)&63);
+ #endif
+
+ /* This information is public so it's safe to branch based on it. */
+- if (rec->orig_len > md_size + 255 + 1)
+- scan_start = rec->orig_len - (md_size + 255 + 1);
++ if (orig_len > md_size + 255 + 1)
++ scan_start = orig_len - (md_size + 255 + 1);
+ /* div_spoiler contains a multiple of md_size that is used to cause the
+ * modulo operation to be constant time. Without this, the time varies
+ * based on the amount of padding when running on Intel chips at least.
+@@ -286,16 +287,13 @@ void ssl3_cbc_copy_mac(unsigned char* out,
+ rotate_offset = (div_spoiler + mac_start - scan_start) % md_size;
+
+ memset(rotated_mac, 0, md_size);
+- for (i = scan_start; i < rec->orig_len;)
++ for (i = scan_start, j = 0; i < orig_len; i++)
+ {
+- for (j = 0; j < md_size && i < rec->orig_len; i++, j++)
+- {
+- unsigned char mac_started = constant_time_ge(i, mac_start);
+- unsigned char mac_ended = constant_time_ge(i, mac_end);
+- unsigned char b = 0;
+- b = rec->data[i];
+- rotated_mac[j] |= b & mac_started & ~mac_ended;
+- }
++ unsigned char mac_started = constant_time_ge(i, mac_start);
++ unsigned char mac_ended = constant_time_ge(i, mac_end);
++ unsigned char b = rec->data[i];
++ rotated_mac[j++] |= b & mac_started & ~mac_ended;
++ j &= constant_time_lt(j,md_size);
+ }
+
+ /* Now rotate the MAC */
+@@ -303,30 +301,43 @@ void ssl3_cbc_copy_mac(unsigned char* out,
+ j = 0;
+ for (i = 0; i < md_size; i++)
+ {
+- unsigned char offset = (div_spoiler + rotate_offset + i) % md_size;
+- out[j++] = rotated_mac[offset];
++ /* in case cache-line is 32 bytes, touch second line */
++ ((volatile unsigned char *)rotated_mac)[rotate_offset^32];
++ out[j++] = rotated_mac[rotate_offset++];
++ rotate_offset &= constant_time_lt(rotate_offset,md_size);
+ }
+ #else
+ memset(out, 0, md_size);
++ rotate_offset = md_size - rotate_offset;
++ rotate_offset &= constant_time_lt(rotate_offset,md_size);
+ for (i = 0; i < md_size; i++)
+ {
+- unsigned char offset = (div_spoiler + md_size - rotate_offset + i) % md_size;
+ for (j = 0; j < md_size; j++)
+- out[j] |= rotated_mac[i] & constant_time_eq_8(j, offset);
++ out[j] |= rotated_mac[i] & constant_time_eq_8(j, rotate_offset);
++ rotate_offset++;
++ rotate_offset &= constant_time_lt(rotate_offset,md_size);
+ }
+ #endif
+ }
+
+/* u32toLE serialises an unsigned, 32-bit number (n) as four bytes at (p) in
+ * little-endian order and advances p by four; n and p are evaluated repeatedly. */
+#define u32toLE(n, p) \
+ (*((p)++)=(unsigned char)(n), \
+ *((p)++)=(unsigned char)((n)>>8), \
+ *((p)++)=(unsigned char)((n)>>16), \
+ *((p)++)=(unsigned char)((n)>>24))
++
+ /* These functions serialize the state of a hash and thus perform the standard
+ * "final" operation without adding the padding and length that such a function
+ * typically does. */
+ static void tls1_md5_final_raw(void* ctx, unsigned char *md_out)
+ {
+ MD5_CTX *md5 = ctx;
+- l2n(md5->A, md_out);
+- l2n(md5->B, md_out);
+- l2n(md5->C, md_out);
+- l2n(md5->D, md_out);
++ u32toLE(md5->A, md_out);
++ u32toLE(md5->B, md_out);
++ u32toLE(md5->C, md_out);
++ u32toLE(md5->D, md_out);
+ }
+
+ static void tls1_sha1_final_raw(void* ctx, unsigned char *md_out)
+@@ -442,6 +453,7 @@ void ssl3_cbc_digest_record(
+ /* mdLengthSize is the number of bytes in the length field that terminates
+ * the hash. */
+ unsigned md_length_size = 8;
++ char length_is_big_endian = 1;
+
+ /* This is a, hopefully redundant, check that allows us to forget about
+ * many possible overflows later in this function. */
+@@ -455,6 +467,7 @@ void ssl3_cbc_digest_record(
+ md_transform = (void(*)(void *ctx, const unsigned char *block)) MD5_Transform;
+ md_size = 16;
+ sslv3_pad_length = 48;
++ length_is_big_endian = 0;
+ break;
+ case NID_sha1:
+ SHA1_Init((SHA_CTX*)md_state.c);
+@@ -595,11 +608,22 @@ void ssl3_cbc_digest_record(
+ md_transform(md_state.c, hmac_pad);
+ }
+
+- memset(length_bytes,0,md_length_size-4);
+- length_bytes[md_length_size-4] = (unsigned char)(bits>>24);
+- length_bytes[md_length_size-3] = (unsigned char)(bits>>16);
+- length_bytes[md_length_size-2] = (unsigned char)(bits>>8);
+- length_bytes[md_length_size-1] = (unsigned char)bits;
++ if (length_is_big_endian)
++ {
++ memset(length_bytes,0,md_length_size-4);
++ length_bytes[md_length_size-4] = (unsigned char)(bits>>24);
++ length_bytes[md_length_size-3] = (unsigned char)(bits>>16);
++ length_bytes[md_length_size-2] = (unsigned char)(bits>>8);
++ length_bytes[md_length_size-1] = (unsigned char)bits;
++ }
++ else
++ {
++ memset(length_bytes,0,md_length_size);
++ length_bytes[md_length_size-5] = (unsigned char)(bits>>24);
++ length_bytes[md_length_size-6] = (unsigned char)(bits>>16);
++ length_bytes[md_length_size-7] = (unsigned char)(bits>>8);
++ length_bytes[md_length_size-8] = (unsigned char)bits;
++ }
+
+ if (k > 0)
+ {
+diff --git a/ssl/s3_clnt.c b/ssl/s3_clnt.c
+index fc53161..f1f9c21 100644
+--- a/ssl/s3_clnt.c
++++ b/ssl/s3_clnt.c
+@@ -888,7 +888,10 @@ int ssl3_get_server_hello(SSL *s)
+ }
+ s->s3->tmp.new_cipher=c;
+ if (!ssl3_digest_cached_records(s))
++ {
++ al = SSL_AD_INTERNAL_ERROR;
+ goto f_err;
++ }
+
+ /* lets get the compression algorithm */
+ /* COMPRESSION */
+@@ -968,7 +971,9 @@ int ssl3_get_server_hello(SSL *s)
+ return(1);
+ f_err:
+ ssl3_send_alert(s,SSL3_AL_FATAL,al);
++#ifndef OPENSSL_NO_TLSEXT
+ err:
++#endif
+ return(-1);
+ }
+
+diff --git a/ssl/s3_enc.c b/ssl/s3_enc.c
+index 76d87b5..6bc0812 100644
+--- a/ssl/s3_enc.c
++++ b/ssl/s3_enc.c
+@@ -697,7 +697,7 @@ int n_ssl3_mac(SSL *ssl, unsigned char *md, int send)
+ EVP_MD_CTX md_ctx;
+ const EVP_MD_CTX *hash;
+ unsigned char *p,rec_char;
+- size_t md_size;
++ size_t md_size, orig_len;
+ int npad;
+ int t;
+
+@@ -722,6 +722,10 @@ int n_ssl3_mac(SSL *ssl, unsigned char *md, int send)
+ md_size=t;
+ npad=(48/md_size)*md_size;
+
++ /* kludge: ssl3_cbc_remove_padding passes padding length in rec->type */
++ orig_len = rec->length+md_size+((unsigned int)rec->type>>8);
++ rec->type &= 0xff;
++
+ if (!send &&
+ EVP_CIPHER_CTX_mode(ssl->enc_read_ctx) == EVP_CIPH_CBC_MODE &&
+ ssl3_cbc_record_digest_supported(hash))
+@@ -753,7 +757,7 @@ int n_ssl3_mac(SSL *ssl, unsigned char *md, int send)
+ hash,
+ md, &md_size,
+ header, rec->input,
+- rec->length + md_size, rec->orig_len,
++ rec->length + md_size, orig_len,
+ mac_sec, md_size,
+ 1 /* is SSLv3 */);
+ }
+diff --git a/ssl/s3_pkt.c b/ssl/s3_pkt.c
+index 7d8fc53..a41279e 100644
+--- a/ssl/s3_pkt.c
++++ b/ssl/s3_pkt.c
+@@ -289,7 +289,7 @@ static int ssl3_get_record(SSL *s)
+ unsigned char *p;
+ unsigned char md[EVP_MAX_MD_SIZE];
+ short version;
+- unsigned mac_size;
++ unsigned mac_size, orig_len;
+ size_t extra;
+
+ rr= &(s->s3->rrec);
+@@ -397,7 +397,6 @@ fprintf(stderr, "Record type=%d, Length=%d\n", rr->type, rr->length);
+
+ /* decrypt in place in 'rr->input' */
+ rr->data=rr->input;
+- rr->orig_len=rr->length;
+
+ enc_err = s->method->ssl3_enc->enc(s,0);
+ /* enc_err is:
+@@ -428,15 +427,18 @@ printf("\n");
+ mac_size=EVP_MD_CTX_size(s->read_hash);
+ OPENSSL_assert(mac_size <= EVP_MAX_MD_SIZE);
+
++ /* kludge: *_cbc_remove_padding passes padding length in rr->type */
++ orig_len = rr->length+((unsigned int)rr->type>>8);
++
+ /* orig_len is the length of the record before any padding was
+ * removed. This is public information, as is the MAC in use,
+ * therefore we can safely process the record in a different
+ * amount of time if it's too short to possibly contain a MAC.
+ */
+- if (rr->orig_len < mac_size ||
++ if (orig_len < mac_size ||
+ /* CBC records must have a padding length byte too. */
+ (EVP_CIPHER_CTX_mode(s->enc_read_ctx) == EVP_CIPH_CBC_MODE &&
+- rr->orig_len < mac_size+1))
++ orig_len < mac_size+1))
+ {
+ al=SSL_AD_DECODE_ERROR;
+ SSLerr(SSL_F_SSL3_GET_RECORD,SSL_R_LENGTH_TOO_SHORT);
+@@ -451,12 +453,12 @@ printf("\n");
+ * without leaking the contents of the padding bytes.
+ * */
+ mac = mac_tmp;
+- ssl3_cbc_copy_mac(mac_tmp, rr, mac_size);
++ ssl3_cbc_copy_mac(mac_tmp, rr, mac_size, orig_len);
+ rr->length -= mac_size;
+ }
+ else
+ {
+- /* In this case there's no padding, so |rec->orig_len|
++ /* In this case there's no padding, so |orig_len|
+ * equals |rec->length| and we checked that there's
+ * enough bytes for |mac_size| above. */
+ rr->length -= mac_size;
+diff --git a/ssl/s3_srvr.c b/ssl/s3_srvr.c
+index b4a6a37..14aa451 100644
+--- a/ssl/s3_srvr.c
++++ b/ssl/s3_srvr.c
+@@ -1269,7 +1269,10 @@ int ssl3_get_client_hello(SSL *s)
+ }
+
+ if (!ssl3_digest_cached_records(s))
++ {
++ al = SSL_AD_INTERNAL_ERROR;
+ goto f_err;
++ }
+
+ /* we now have the following setup.
+ * client_random
+@@ -1282,6 +1285,7 @@ int ssl3_get_client_hello(SSL *s)
+ * s->tmp.new_cipher - the new cipher to use.
+ */
+
++#ifndef OPENSSL_NO_TLSEXT
+ /* Handles TLS extensions that we couldn't check earlier */
+ if (s->version >= SSL3_VERSION)
+ {
+@@ -1291,6 +1295,7 @@ int ssl3_get_client_hello(SSL *s)
+ goto err;
+ }
+ }
++#endif
+
+ if (ret < 0) ret=1;
+ if (0)
+diff --git a/ssl/ssl3.h b/ssl/ssl3.h
+index d3bd768..9c2c412 100644
+--- a/ssl/ssl3.h
++++ b/ssl/ssl3.h
+@@ -349,10 +349,6 @@ typedef struct ssl3_record_st
+ /*r */ unsigned char *comp; /* only used with decompression - malloc()ed */
+ /*r */ unsigned long epoch; /* epoch number, needed by DTLS1 */
+ /*r */ unsigned char seq_num[8]; /* sequence number, needed by DTLS1 */
+-/*rw*/ unsigned int orig_len; /* How many bytes were available before padding
+- was removed? This is used to implement the
+- MAC check in constant time for CBC records.
+- */
+ } SSL3_RECORD;
+
+ typedef struct ssl3_buffer_st
+diff --git a/ssl/ssl_lib.c b/ssl/ssl_lib.c
+index 25573e4..b3c21ea 100644
+--- a/ssl/ssl_lib.c
++++ b/ssl/ssl_lib.c
+@@ -2601,9 +2601,7 @@ void ssl_clear_cipher_ctx(SSL *s)
+ /* Fix this function so that it takes an optional type parameter */
+ X509 *SSL_get_certificate(const SSL *s)
+ {
+- if (s->server)
+- return(ssl_get_server_send_cert(s));
+- else if (s->cert != NULL)
++ if (s->cert != NULL)
+ return(s->cert->key->x509);
+ else
+ return(NULL);
+diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h
+index 6a4f62a..b0dab18 100644
+--- a/ssl/ssl_locl.h
++++ b/ssl/ssl_locl.h
+@@ -1091,7 +1091,7 @@ int ssl_parse_clienthello_renegotiate_ext(SSL *s, unsigned char *d, int len,
+ /* s3_cbc.c */
+ void ssl3_cbc_copy_mac(unsigned char* out,
+ const SSL3_RECORD *rec,
+- unsigned md_size);
++ unsigned md_size,unsigned orig_len);
+ int ssl3_cbc_remove_padding(const SSL* s,
+ SSL3_RECORD *rec,
+ unsigned block_size,
+diff --git a/ssl/ssltest.c b/ssl/ssltest.c
+index 0bb4fa4..eaad524 100644
+--- a/ssl/ssltest.c
++++ b/ssl/ssltest.c
+@@ -782,7 +782,13 @@ bad:
+ meth=SSLv23_method();
+ #else
+ #ifdef OPENSSL_NO_SSL2
+- meth=SSLv3_method();
++ if (tls1)
++ meth=TLSv1_method();
++ else
++ if (ssl3)
++ meth=SSLv3_method();
++ else
++ meth=SSLv23_method();
+ #else
+ meth=SSLv2_method();
+ #endif
+diff --git a/ssl/t1_enc.c b/ssl/t1_enc.c
+index c38dae6..d67f6f1 100644
+--- a/ssl/t1_enc.c
++++ b/ssl/t1_enc.c
+@@ -851,7 +851,7 @@ int tls1_mac(SSL *ssl, unsigned char *md, int send)
+ SSL3_RECORD *rec;
+ unsigned char *seq;
+ EVP_MD_CTX *hash;
+- size_t md_size;
++ size_t md_size, orig_len;
+ int i;
+ EVP_MD_CTX hmac, *mac_ctx;
+ unsigned char header[13];
+@@ -898,6 +898,10 @@ int tls1_mac(SSL *ssl, unsigned char *md, int send)
+ else
+ memcpy(header, seq, 8);
+
++ /* kludge: tls1_cbc_remove_padding passes padding length in rec->type */
++ orig_len = rec->length+md_size+((unsigned int)rec->type>>8);
++ rec->type &= 0xff;
++
+ header[8]=rec->type;
+ header[9]=(unsigned char)(ssl->version>>8);
+ header[10]=(unsigned char)(ssl->version);
+@@ -916,7 +920,7 @@ int tls1_mac(SSL *ssl, unsigned char *md, int send)
+ mac_ctx,
+ md, &md_size,
+ header, rec->input,
+- rec->length + md_size, rec->orig_len,
++ rec->length + md_size, orig_len,
+ ssl->s3->read_mac_secret,
+ ssl->s3->read_mac_secret_size,
+ 0 /* not SSLv3 */);
+diff --git a/test/cms-test.pl b/test/cms-test.pl
+index c938bcf..dfef799 100644
+--- a/test/cms-test.pl
++++ b/test/cms-test.pl
+@@ -415,8 +415,10 @@ sub run_smime_tests {
+ }
+
+ sub cmp_files {
++ use FileHandle;
+ my ( $f1, $f2 ) = @_;
+- my ( $fp1, $fp2 );
++ my $fp1 = FileHandle->new();
++ my $fp2 = FileHandle->new();
+
+ my ( $rd1, $rd2 );
+
+diff --git a/test/testssl b/test/testssl
+index b55364a..04341e9 100644
+--- a/test/testssl
++++ b/test/testssl
+@@ -119,6 +119,23 @@ $ssltest -bio_pair -server_auth -client_auth $CA $extra || exit 1
+ echo test sslv2/sslv3 with both client and server authentication via BIO pair and app verify
+ $ssltest -bio_pair -server_auth -client_auth -app_verify $CA $extra || exit 1
+
+echo "Testing ciphersuites"
+for protocol in SSLv3; do
+  echo "Testing ciphersuites for $protocol"
+  for cipher in `../util/shlib_wrap.sh ../apps/openssl ciphers "RSA+$protocol" | tr ':' ' '`; do
+    echo "Testing $cipher"
+    prot=""
+    if [ "$protocol" = "SSLv3" ] ; then # POSIX test compares strings with '='; '==' is a bashism
+      prot="-ssl3"
+    fi
+    $ssltest -cipher "$cipher" $prot # $ssltest and $prot stay unquoted: they may expand to several words or to nothing
+    if [ $? -ne 0 ] ; then
+      echo "Failed $cipher"
+      exit 1
+    fi
+  done
+done
++
+ #############################################################################
+
+ if ../util/shlib_wrap.sh ../apps/openssl no-dh; then
diff --git a/openssl-1.0.0f-fips.patch b/openssl-1.0.0k-fips.patch
similarity index 91%
rename from openssl-1.0.0f-fips.patch
rename to openssl-1.0.0k-fips.patch
index 435d72f..8b53d11 100644
--- a/openssl-1.0.0f-fips.patch
+++ b/openssl-1.0.0k-fips.patch
@@ -1,7 +1,7 @@
-diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
---- openssl-1.0.0f/Configure.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/Configure 2012-01-05 13:22:30.000000000 +0100
-@@ -663,6 +663,7 @@ my $cmll_enc="camellia.o cmll_misc.o cml
+diff -up openssl-1.0.0k/Configure.fips openssl-1.0.0k/Configure
+--- openssl-1.0.0k/Configure.fips 2013-02-19 20:12:54.536663757 +0100
++++ openssl-1.0.0k/Configure 2013-02-19 20:12:54.574664476 +0100
+@@ -664,6 +664,7 @@ my $cmll_enc="camellia.o cmll_misc.o cml
my $processor="";
my $default_ranlib;
my $perl;
@@ -9,7 +9,7 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
# All of the following is disabled by default (RC5 was enabled before 0.9.8):
-@@ -809,6 +810,10 @@ PROCESS_ARGS:
+@@ -810,6 +811,10 @@ PROCESS_ARGS:
}
elsif (/^386$/)
{ $processor=386; }
@@ -20,7 +20,7 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
elsif (/^rsaref$/)
{
# No RSAref support any more since it's not needed.
-@@ -1383,6 +1388,11 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no
+@@ -1386,6 +1391,11 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
@@ -32,7 +32,7 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
$cpuid_obj="mem_clr.o" unless ($cpuid_obj =~ /\.o$/);
$des_obj=$des_enc unless ($des_obj =~ /\.o$/);
$bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/);
-@@ -1550,6 +1560,10 @@ while (<IN>)
+@@ -1553,6 +1563,10 @@ while (<IN>)
s/^LIBKRB5=.*/LIBKRB5=$withargs{"krb5-lib"}/;
s/^LIBZLIB=.*/LIBZLIB=$withargs{"zlib-lib"}/;
s/^ZLIB_INCLUDE=.*/ZLIB_INCLUDE=$withargs{"zlib-include"}/;
@@ -43,9 +43,9 @@ diff -up openssl-1.0.0f/Configure.fips openssl-1.0.0f/Configure
s/^SHLIB_TARGET=.*/SHLIB_TARGET=$shared_target/;
s/^SHLIB_MARK=.*/SHLIB_MARK=$shared_mark/;
s/^SHARED_LIBS=.*/SHARED_LIBS=\$(SHARED_CRYPTO) \$(SHARED_SSL)/ if (!$no_shared);
-diff -up openssl-1.0.0f/crypto/bf/bf_skey.c.fips openssl-1.0.0f/crypto/bf/bf_skey.c
---- openssl-1.0.0f/crypto/bf/bf_skey.c.fips 2008-11-12 04:57:52.000000000 +0100
-+++ openssl-1.0.0f/crypto/bf/bf_skey.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bf/bf_skey.c.fips openssl-1.0.0k/crypto/bf/bf_skey.c
+--- openssl-1.0.0k/crypto/bf/bf_skey.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/bf/bf_skey.c 2013-02-19 20:12:54.574664476 +0100
@@ -59,10 +59,15 @@
#include <stdio.h>
#include <string.h>
@@ -63,9 +63,9 @@ diff -up openssl-1.0.0f/crypto/bf/bf_skey.c.fips openssl-1.0.0f/crypto/bf/bf_ske
{
int i;
BF_LONG *p,ri,in[2];
-diff -up openssl-1.0.0f/crypto/bf/blowfish.h.fips openssl-1.0.0f/crypto/bf/blowfish.h
---- openssl-1.0.0f/crypto/bf/blowfish.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/bf/blowfish.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bf/blowfish.h.fips openssl-1.0.0k/crypto/bf/blowfish.h
+--- openssl-1.0.0k/crypto/bf/blowfish.h.fips 2013-02-19 20:12:53.998653547 +0100
++++ openssl-1.0.0k/crypto/bf/blowfish.h 2013-02-19 20:12:54.575664496 +0100
@@ -104,7 +104,9 @@ typedef struct bf_key_st
BF_LONG S[4*256];
} BF_KEY;
@@ -77,9 +77,9 @@ diff -up openssl-1.0.0f/crypto/bf/blowfish.h.fips openssl-1.0.0f/crypto/bf/blowf
void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
void BF_encrypt(BF_LONG *data,const BF_KEY *key);
-diff -up openssl-1.0.0f/crypto/bn/bn.h.fips openssl-1.0.0f/crypto/bn/bn.h
---- openssl-1.0.0f/crypto/bn/bn.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/bn/bn.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bn/bn.h.fips openssl-1.0.0k/crypto/bn/bn.h
+--- openssl-1.0.0k/crypto/bn/bn.h.fips 2013-02-19 20:12:54.135656147 +0100
++++ openssl-1.0.0k/crypto/bn/bn.h 2013-02-19 20:12:54.575664496 +0100
@@ -558,6 +558,17 @@ int BN_is_prime_ex(const BIGNUM *p,int n
int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
int do_trial_division, BN_GENCB *cb);
@@ -98,9 +98,9 @@ diff -up openssl-1.0.0f/crypto/bn/bn.h.fips openssl-1.0.0f/crypto/bn/bn.h
BN_MONT_CTX *BN_MONT_CTX_new(void );
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
-diff -up openssl-1.0.0f/crypto/bn/bn_x931p.c.fips openssl-1.0.0f/crypto/bn/bn_x931p.c
---- openssl-1.0.0f/crypto/bn/bn_x931p.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/bn/bn_x931p.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bn/bn_x931p.c.fips openssl-1.0.0k/crypto/bn/bn_x931p.c
+--- openssl-1.0.0k/crypto/bn/bn_x931p.c.fips 2013-02-19 20:12:54.575664496 +0100
++++ openssl-1.0.0k/crypto/bn/bn_x931p.c 2013-02-19 20:12:54.576664516 +0100
@@ -0,0 +1,272 @@
+/* bn_x931p.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -374,9 +374,9 @@ diff -up openssl-1.0.0f/crypto/bn/bn_x931p.c.fips openssl-1.0.0f/crypto/bn/bn_x9
+
+ }
+
-diff -up openssl-1.0.0f/crypto/bn/Makefile.fips openssl-1.0.0f/crypto/bn/Makefile
---- openssl-1.0.0f/crypto/bn/Makefile.fips 2008-11-12 09:19:02.000000000 +0100
-+++ openssl-1.0.0f/crypto/bn/Makefile 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/bn/Makefile.fips openssl-1.0.0k/crypto/bn/Makefile
+--- openssl-1.0.0k/crypto/bn/Makefile.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/bn/Makefile 2013-02-19 20:12:54.576664516 +0100
@@ -26,13 +26,13 @@ LIBSRC= bn_add.c bn_div.c bn_exp.c bn_li
bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c bn_asm.c \
@@ -393,9 +393,9 @@ diff -up openssl-1.0.0f/crypto/bn/Makefile.fips openssl-1.0.0f/crypto/bn/Makefil
SRC= $(LIBSRC)
-diff -up openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl.fips openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl
---- openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl.fips 2009-04-06 16:25:02.000000000 +0200
-+++ openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl.fips openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl
+--- openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/camellia/asm/cmll-x86.pl 2013-02-19 20:12:54.576664516 +0100
@@ -722,12 +722,15 @@ my $bias=int(@T[0])?shift(@T):0;
}
&function_end("Camellia_Ekeygen");
@@ -422,10 +422,10 @@ diff -up openssl-1.0.0f/crypto/camellia/asm/cmll-x86.pl.fips openssl-1.0.0f/cryp
}
@SBOX=(
-diff -up openssl-1.0.0f/crypto/camellia/camellia.h.fips openssl-1.0.0f/crypto/camellia/camellia.h
---- openssl-1.0.0f/crypto/camellia/camellia.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/camellia.h 2012-01-05 13:22:30.000000000 +0100
-@@ -88,6 +88,11 @@ struct camellia_key_st
+diff -up openssl-1.0.0k/crypto/camellia/camellia.h.fips openssl-1.0.0k/crypto/camellia/camellia.h
+--- openssl-1.0.0k/crypto/camellia/camellia.h.fips 2013-02-19 20:12:53.926652181 +0100
++++ openssl-1.0.0k/crypto/camellia/camellia.h 2013-02-19 20:12:54.577664536 +0100
+@@ -88,6 +88,11 @@ struct camellia_key_st
};
typedef struct camellia_key_st CAMELLIA_KEY;
@@ -437,9 +437,9 @@ diff -up openssl-1.0.0f/crypto/camellia/camellia.h.fips openssl-1.0.0f/crypto/ca
int Camellia_set_key(const unsigned char *userKey, const int bits,
CAMELLIA_KEY *key);
-diff -up openssl-1.0.0f/crypto/camellia/cmll_fblk.c.fips openssl-1.0.0f/crypto/camellia/cmll_fblk.c
---- openssl-1.0.0f/crypto/camellia/cmll_fblk.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/cmll_fblk.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/cmll_fblk.c.fips openssl-1.0.0k/crypto/camellia/cmll_fblk.c
+--- openssl-1.0.0k/crypto/camellia/cmll_fblk.c.fips 2013-02-19 20:12:54.577664536 +0100
++++ openssl-1.0.0k/crypto/camellia/cmll_fblk.c 2013-02-19 20:12:54.577664536 +0100
@@ -0,0 +1,68 @@
+/* crypto/camellia/camellia_misc.c -*- mode:C; c-file-style: "eay" -*- */
+/* ====================================================================
@@ -509,9 +509,9 @@ diff -up openssl-1.0.0f/crypto/camellia/cmll_fblk.c.fips openssl-1.0.0f/crypto/c
+ return private_Camellia_set_key(userKey, bits, key);
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/camellia/cmll_misc.c.fips openssl-1.0.0f/crypto/camellia/cmll_misc.c
---- openssl-1.0.0f/crypto/camellia/cmll_misc.c.fips 2008-10-28 13:13:52.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/cmll_misc.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/cmll_misc.c.fips openssl-1.0.0k/crypto/camellia/cmll_misc.c
+--- openssl-1.0.0k/crypto/camellia/cmll_misc.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/camellia/cmll_misc.c 2013-02-19 20:12:54.577664536 +0100
@@ -52,11 +52,20 @@
#include <openssl/opensslv.h>
#include <openssl/camellia.h>
@@ -533,9 +533,9 @@ diff -up openssl-1.0.0f/crypto/camellia/cmll_misc.c.fips openssl-1.0.0f/crypto/c
{
if(!userKey || !key)
return -1;
-diff -up openssl-1.0.0f/crypto/camellia/Makefile.fips openssl-1.0.0f/crypto/camellia/Makefile
---- openssl-1.0.0f/crypto/camellia/Makefile.fips 2008-12-23 12:33:00.000000000 +0100
-+++ openssl-1.0.0f/crypto/camellia/Makefile 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/camellia/Makefile.fips openssl-1.0.0k/crypto/camellia/Makefile
+--- openssl-1.0.0k/crypto/camellia/Makefile.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/camellia/Makefile 2013-02-19 20:12:54.578664555 +0100
@@ -23,9 +23,9 @@ APPS=
LIB=$(TOP)/libcrypto.a
@@ -548,9 +548,9 @@ diff -up openssl-1.0.0f/crypto/camellia/Makefile.fips openssl-1.0.0f/crypto/came
SRC= $(LIBSRC)
-diff -up openssl-1.0.0f/crypto/cast/cast.h.fips openssl-1.0.0f/crypto/cast/cast.h
---- openssl-1.0.0f/crypto/cast/cast.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/cast/cast.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/cast/cast.h.fips openssl-1.0.0k/crypto/cast/cast.h
+--- openssl-1.0.0k/crypto/cast/cast.h.fips 2013-02-19 20:12:54.363660475 +0100
++++ openssl-1.0.0k/crypto/cast/cast.h 2013-02-19 20:12:54.578664555 +0100
@@ -83,7 +83,9 @@ typedef struct cast_key_st
int short_key; /* Use reduced rounds for short key */
} CAST_KEY;
@@ -562,9 +562,9 @@ diff -up openssl-1.0.0f/crypto/cast/cast.h.fips openssl-1.0.0f/crypto/cast/cast.
void CAST_set_key(CAST_KEY *key, int len, const unsigned char *data);
void CAST_ecb_encrypt(const unsigned char *in, unsigned char *out, const CAST_KEY *key,
int enc);
-diff -up openssl-1.0.0f/crypto/cast/c_skey.c.fips openssl-1.0.0f/crypto/cast/c_skey.c
---- openssl-1.0.0f/crypto/cast/c_skey.c.fips 2000-06-03 16:13:35.000000000 +0200
-+++ openssl-1.0.0f/crypto/cast/c_skey.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/cast/c_skey.c.fips openssl-1.0.0k/crypto/cast/c_skey.c
+--- openssl-1.0.0k/crypto/cast/c_skey.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/cast/c_skey.c 2013-02-19 20:12:54.578664555 +0100
@@ -57,6 +57,11 @@
*/
@@ -586,12 +586,12 @@ diff -up openssl-1.0.0f/crypto/cast/c_skey.c.fips openssl-1.0.0f/crypto/cast/c_s
{
CAST_LONG x[16];
CAST_LONG z[16];
-diff -up openssl-1.0.0f/crypto/crypto.h.fips openssl-1.0.0f/crypto/crypto.h
---- openssl-1.0.0f/crypto/crypto.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/crypto.h 2012-01-05 13:22:30.000000000 +0100
-@@ -547,12 +547,70 @@ unsigned long *OPENSSL_ia32cap_loc(void)
- #define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
- int OPENSSL_isservice(void);
+diff -up openssl-1.0.0k/crypto/crypto.h.fips openssl-1.0.0k/crypto/crypto.h
+--- openssl-1.0.0k/crypto/crypto.h.fips 2013-02-19 20:12:54.000000000 +0100
++++ openssl-1.0.0k/crypto/crypto.h 2013-02-19 20:14:08.209061781 +0100
+@@ -554,12 +554,70 @@ int OPENSSL_isservice(void);
+ * non-zero. */
+ int CRYPTO_memcmp(const void *a, const void *b, size_t len);
+
+#ifdef OPENSSL_FIPS
@@ -660,9 +660,9 @@ diff -up openssl-1.0.0f/crypto/crypto.h.fips openssl-1.0.0f/crypto/crypto.h
/* Error codes for the CRYPTO functions. */
/* Function codes. */
-diff -up openssl-1.0.0f/crypto/dh/dh_err.c.fips openssl-1.0.0f/crypto/dh/dh_err.c
---- openssl-1.0.0f/crypto/dh/dh_err.c.fips 2006-11-21 22:29:37.000000000 +0100
-+++ openssl-1.0.0f/crypto/dh/dh_err.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh_err.c.fips openssl-1.0.0k/crypto/dh/dh_err.c
+--- openssl-1.0.0k/crypto/dh/dh_err.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dh/dh_err.c 2013-02-19 20:12:54.579664573 +0100
@@ -73,6 +73,8 @@ static ERR_STRING_DATA DH_str_functs[]=
{ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"},
{ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"},
@@ -680,9 +680,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh_err.c.fips openssl-1.0.0f/crypto/dh/dh_err.
{ERR_REASON(DH_R_KEYS_NOT_SET) ,"keys not set"},
{ERR_REASON(DH_R_MODULUS_TOO_LARGE) ,"modulus too large"},
{ERR_REASON(DH_R_NO_PARAMETERS_SET) ,"no parameters set"},
-diff -up openssl-1.0.0f/crypto/dh/dh_gen.c.fips openssl-1.0.0f/crypto/dh/dh_gen.c
---- openssl-1.0.0f/crypto/dh/dh_gen.c.fips 2005-04-26 20:53:15.000000000 +0200
-+++ openssl-1.0.0f/crypto/dh/dh_gen.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh_gen.c.fips openssl-1.0.0k/crypto/dh/dh_gen.c
+--- openssl-1.0.0k/crypto/dh/dh_gen.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dh/dh_gen.c 2013-02-19 20:12:54.579664573 +0100
@@ -65,6 +65,10 @@
#include "cryptlib.h"
#include <openssl/bn.h>
@@ -715,9 +715,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh_gen.c.fips openssl-1.0.0f/crypto/dh/dh_gen.
ctx=BN_CTX_new();
if (ctx == NULL) goto err;
BN_CTX_start(ctx);
-diff -up openssl-1.0.0f/crypto/dh/dh.h.fips openssl-1.0.0f/crypto/dh/dh.h
---- openssl-1.0.0f/crypto/dh/dh.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/dh/dh.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh.h.fips openssl-1.0.0k/crypto/dh/dh.h
+--- openssl-1.0.0k/crypto/dh/dh.h.fips 2013-02-19 20:12:54.259658499 +0100
++++ openssl-1.0.0k/crypto/dh/dh.h 2013-02-19 20:12:54.580664592 +0100
@@ -77,6 +77,8 @@
# define OPENSSL_DH_MAX_MODULUS_BITS 10000
#endif
@@ -744,9 +744,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh.h.fips openssl-1.0.0f/crypto/dh/dh.h
#ifdef __cplusplus
}
-diff -up openssl-1.0.0f/crypto/dh/dh_key.c.fips openssl-1.0.0f/crypto/dh/dh_key.c
---- openssl-1.0.0f/crypto/dh/dh_key.c.fips 2007-03-28 02:15:23.000000000 +0200
-+++ openssl-1.0.0f/crypto/dh/dh_key.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dh/dh_key.c.fips openssl-1.0.0k/crypto/dh/dh_key.c
+--- openssl-1.0.0k/crypto/dh/dh_key.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dh/dh_key.c 2013-02-19 20:12:54.580664592 +0100
@@ -61,6 +61,9 @@
#include <openssl/bn.h>
#include <openssl/rand.h>
@@ -796,9 +796,9 @@ diff -up openssl-1.0.0f/crypto/dh/dh_key.c.fips openssl-1.0.0f/crypto/dh/dh_key.
dh->flags |= DH_FLAG_CACHE_MONT_P;
return(1);
}
-diff -up openssl-1.0.0f/crypto/dsa/dsa_gen.c.fips openssl-1.0.0f/crypto/dsa/dsa_gen.c
---- openssl-1.0.0f/crypto/dsa/dsa_gen.c.fips 2010-06-15 19:25:07.000000000 +0200
-+++ openssl-1.0.0f/crypto/dsa/dsa_gen.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa_gen.c.fips openssl-1.0.0k/crypto/dsa/dsa_gen.c
+--- openssl-1.0.0k/crypto/dsa/dsa_gen.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa_gen.c 2013-02-19 20:12:54.580664592 +0100
@@ -77,8 +77,12 @@
#include "cryptlib.h"
#include <openssl/evp.h>
@@ -834,9 +834,9 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_gen.c.fips openssl-1.0.0f/crypto/dsa/dsa_
if (qsize != SHA_DIGEST_LENGTH && qsize != SHA224_DIGEST_LENGTH &&
qsize != SHA256_DIGEST_LENGTH)
/* invalid q size */
-diff -up openssl-1.0.0f/crypto/dsa/dsa.h.fips openssl-1.0.0f/crypto/dsa/dsa.h
---- openssl-1.0.0f/crypto/dsa/dsa.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/dsa/dsa.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa.h.fips openssl-1.0.0k/crypto/dsa/dsa.h
+--- openssl-1.0.0k/crypto/dsa/dsa.h.fips 2013-02-19 20:12:54.099655464 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa.h 2013-02-19 20:12:54.581664610 +0100
@@ -88,6 +88,8 @@
# define OPENSSL_DSA_MAX_MODULUS_BITS 10000
#endif
@@ -893,9 +893,9 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa.h.fips openssl-1.0.0f/crypto/dsa/dsa.h
#define DSA_R_PARAMETER_ENCODING_ERROR 105
#ifdef __cplusplus
-diff -up openssl-1.0.0f/crypto/dsa/dsa_key.c.fips openssl-1.0.0f/crypto/dsa/dsa_key.c
---- openssl-1.0.0f/crypto/dsa/dsa_key.c.fips 2007-03-28 02:15:25.000000000 +0200
-+++ openssl-1.0.0f/crypto/dsa/dsa_key.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa_key.c.fips openssl-1.0.0k/crypto/dsa/dsa_key.c
+--- openssl-1.0.0k/crypto/dsa/dsa_key.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa_key.c 2013-02-19 20:12:54.581664610 +0100
@@ -63,9 +63,55 @@
#include <openssl/bn.h>
#include <openssl/dsa.h>
@@ -983,9 +983,9 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_key.c.fips openssl-1.0.0f/crypto/dsa/dsa_
ok=1;
err:
-diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa_ossl.c
---- openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips 2011-02-01 13:54:04.000000000 +0100
-+++ openssl-1.0.0f/crypto/dsa/dsa_ossl.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0k/crypto/dsa/dsa_ossl.c
+--- openssl-1.0.0k/crypto/dsa/dsa_ossl.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/dsa/dsa_ossl.c 2013-02-19 20:12:54.582664628 +0100
@@ -65,6 +65,9 @@
#include <openssl/dsa.h>
#include <openssl/rand.h>
@@ -1026,7 +1026,7 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa
BN_init(&m);
BN_init(&xr);
-@@ -303,6 +320,20 @@ static int dsa_do_verify(const unsigned
+@@ -303,6 +320,20 @@ static int dsa_do_verify(const unsigned
return -1;
}
@@ -1047,7 +1047,7 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa
i = BN_num_bits(dsa->q);
/* fips 186-3 allows only different sizes for q */
if (i != 160 && i != 224 && i != 256)
-@@ -385,6 +416,9 @@ static int dsa_do_verify(const unsigned
+@@ -385,6 +416,9 @@ static int dsa_do_verify(const unsigned
static int dsa_init(DSA *dsa)
{
@@ -1057,10 +1057,10 @@ diff -up openssl-1.0.0f/crypto/dsa/dsa_ossl.c.fips openssl-1.0.0f/crypto/dsa/dsa
dsa->flags|=DSA_FLAG_CACHE_MONT_P;
return(1);
}
-diff -up openssl-1.0.0f/crypto/err/err_all.c.fips openssl-1.0.0f/crypto/err/err_all.c
---- openssl-1.0.0f/crypto/err/err_all.c.fips 2009-08-09 16:58:05.000000000 +0200
-+++ openssl-1.0.0f/crypto/err/err_all.c 2012-01-05 13:22:30.000000000 +0100
-@@ -96,6 +96,9 @@
+diff -up openssl-1.0.0k/crypto/err/err_all.c.fips openssl-1.0.0k/crypto/err/err_all.c
+--- openssl-1.0.0k/crypto/err/err_all.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/err/err_all.c 2013-02-19 20:12:54.582664628 +0100
+@@ -98,6 +98,9 @@
#include <openssl/ocsp.h>
#include <openssl/err.h>
#include <openssl/ts.h>
@@ -1070,7 +1070,7 @@ diff -up openssl-1.0.0f/crypto/err/err_all.c.fips openssl-1.0.0f/crypto/err/err_
#ifndef OPENSSL_NO_CMS
#include <openssl/cms.h>
#endif
-@@ -149,6 +152,9 @@ void ERR_load_crypto_strings(void)
+@@ -152,6 +155,9 @@ void ERR_load_crypto_strings(void)
#endif
ERR_load_OCSP_strings();
ERR_load_UI_strings();
@@ -1080,9 +1080,9 @@ diff -up openssl-1.0.0f/crypto/err/err_all.c.fips openssl-1.0.0f/crypto/err/err_
#ifndef OPENSSL_NO_CMS
ERR_load_CMS_strings();
#endif
-diff -up openssl-1.0.0f/crypto/evp/digest.c.fips openssl-1.0.0f/crypto/evp/digest.c
---- openssl-1.0.0f/crypto/evp/digest.c.fips 2010-03-05 14:33:43.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/digest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/digest.c.fips openssl-1.0.0k/crypto/evp/digest.c
+--- openssl-1.0.0k/crypto/evp/digest.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/digest.c 2013-02-19 20:12:54.582664628 +0100
@@ -116,6 +116,7 @@
#ifndef OPENSSL_NO_ENGINE
#include <openssl/engine.h>
@@ -1181,9 +1181,9 @@ diff -up openssl-1.0.0f/crypto/evp/digest.c.fips openssl-1.0.0f/crypto/evp/diges
OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
ret=ctx->digest->final(ctx,md);
-diff -up openssl-1.0.0f/crypto/evp/e_aes.c.fips openssl-1.0.0f/crypto/evp/e_aes.c
---- openssl-1.0.0f/crypto/evp/e_aes.c.fips 2004-01-28 20:05:33.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_aes.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_aes.c.fips openssl-1.0.0k/crypto/evp/e_aes.c
+--- openssl-1.0.0k/crypto/evp/e_aes.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_aes.c 2013-02-19 20:12:54.583664647 +0100
@@ -69,32 +69,29 @@ typedef struct
IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
@@ -1210,35 +1210,32 @@ diff -up openssl-1.0.0f/crypto/evp/e_aes.c.fips openssl-1.0.0f/crypto/evp/e_aes.
- EVP_CIPHER_set_asn1_iv,
- EVP_CIPHER_get_asn1_iv,
- NULL)
--
--#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
--
--IMPLEMENT_AES_CFBR(128,1)
--IMPLEMENT_AES_CFBR(192,1)
--IMPLEMENT_AES_CFBR(256,1)
--
--IMPLEMENT_AES_CFBR(128,8)
--IMPLEMENT_AES_CFBR(192,8)
--IMPLEMENT_AES_CFBR(256,8)
+ EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_DEFAULT_ASN1,
+ aes_init_key,
+ NULL, NULL, NULL, NULL)
-+
+
+-#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
+#define IMPLEMENT_AES_CFBR(ksize,cbits,flags) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16,flags)
-+
+
+-IMPLEMENT_AES_CFBR(128,1)
+-IMPLEMENT_AES_CFBR(192,1)
+-IMPLEMENT_AES_CFBR(256,1)
+IMPLEMENT_AES_CFBR(128,1,EVP_CIPH_FLAG_FIPS)
+IMPLEMENT_AES_CFBR(192,1,EVP_CIPH_FLAG_FIPS)
+IMPLEMENT_AES_CFBR(256,1,EVP_CIPH_FLAG_FIPS)
-+
+
+-IMPLEMENT_AES_CFBR(128,8)
+-IMPLEMENT_AES_CFBR(192,8)
+-IMPLEMENT_AES_CFBR(256,8)
+IMPLEMENT_AES_CFBR(128,8,EVP_CIPH_FLAG_FIPS)
+IMPLEMENT_AES_CFBR(192,8,EVP_CIPH_FLAG_FIPS)
+IMPLEMENT_AES_CFBR(256,8,EVP_CIPH_FLAG_FIPS)
static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc)
-diff -up openssl-1.0.0f/crypto/evp/e_camellia.c.fips openssl-1.0.0f/crypto/evp/e_camellia.c
---- openssl-1.0.0f/crypto/evp/e_camellia.c.fips 2006-08-31 22:56:20.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/e_camellia.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_camellia.c.fips openssl-1.0.0k/crypto/evp/e_camellia.c
+--- openssl-1.0.0k/crypto/evp/e_camellia.c.fips 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_camellia.c 2013-02-19 20:12:54.583664647 +0100
@@ -93,7 +93,7 @@ IMPLEMENT_BLOCK_CIPHER(camellia_256, ks,
EVP_CIPHER_get_asn1_iv,
NULL)
@@ -1248,9 +1245,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_camellia.c.fips openssl-1.0.0f/crypto/evp/e
IMPLEMENT_CAMELLIA_CFBR(128,1)
IMPLEMENT_CAMELLIA_CFBR(192,1)
-diff -up openssl-1.0.0f/crypto/evp/e_des3.c.fips openssl-1.0.0f/crypto/evp/e_des3.c
---- openssl-1.0.0f/crypto/evp/e_des3.c.fips 2008-12-29 13:35:47.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_des3.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_des3.c.fips openssl-1.0.0k/crypto/evp/e_des3.c
+--- openssl-1.0.0k/crypto/evp/e_des3.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_des3.c 2013-02-19 20:12:54.583664647 +0100
@@ -206,9 +206,9 @@ static int des_ede3_cfb8_cipher(EVP_CIPH
}
@@ -1264,7 +1261,7 @@ diff -up openssl-1.0.0f/crypto/evp/e_des3.c.fips openssl-1.0.0f/crypto/evp/e_des
des3_ctrl)
#define des_ede3_cfb64_cipher des_ede_cfb64_cipher
-@@ -217,21 +217,21 @@ BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY,
+@@ -217,21 +217,21 @@ BLOCK_CIPHER_defs(des_ede, DES_EDE_KEY,
#define des_ede3_ecb_cipher des_ede_ecb_cipher
BLOCK_CIPHER_defs(des_ede3, DES_EDE_KEY, NID_des_ede3, 8, 24, 8, 64,
@@ -1295,9 +1292,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_des3.c.fips openssl-1.0.0f/crypto/evp/e_des
des3_ctrl)
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-diff -up openssl-1.0.0f/crypto/evp/e_null.c.fips openssl-1.0.0f/crypto/evp/e_null.c
---- openssl-1.0.0f/crypto/evp/e_null.c.fips 2008-10-31 20:48:24.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_null.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_null.c.fips openssl-1.0.0k/crypto/evp/e_null.c
+--- openssl-1.0.0k/crypto/evp/e_null.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_null.c 2013-02-19 20:12:54.584664666 +0100
@@ -69,7 +69,7 @@ static const EVP_CIPHER n_cipher=
{
NID_undef,
@@ -1307,9 +1304,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_null.c.fips openssl-1.0.0f/crypto/evp/e_nul
null_init_key,
null_cipher,
NULL,
-diff -up openssl-1.0.0f/crypto/evp/e_rc4.c.fips openssl-1.0.0f/crypto/evp/e_rc4.c
---- openssl-1.0.0f/crypto/evp/e_rc4.c.fips 2008-10-31 20:48:24.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/e_rc4.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/e_rc4.c.fips openssl-1.0.0k/crypto/evp/e_rc4.c
+--- openssl-1.0.0k/crypto/evp/e_rc4.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/e_rc4.c 2013-02-19 20:12:54.584664666 +0100
@@ -64,6 +64,7 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
@@ -1318,9 +1315,9 @@ diff -up openssl-1.0.0f/crypto/evp/e_rc4.c.fips openssl-1.0.0f/crypto/evp/e_rc4.
/* FIXME: surely this is available elsewhere? */
#define EVP_RC4_KEY_SIZE 16
-diff -up openssl-1.0.0f/crypto/evp/evp_enc.c.fips openssl-1.0.0f/crypto/evp/evp_enc.c
---- openssl-1.0.0f/crypto/evp/evp_enc.c.fips 2010-10-12 01:24:49.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/evp_enc.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_enc.c.fips openssl-1.0.0k/crypto/evp/evp_enc.c
+--- openssl-1.0.0k/crypto/evp/evp_enc.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/evp_enc.c 2013-02-19 20:12:54.584664666 +0100
@@ -68,8 +68,53 @@
const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT;
@@ -1413,9 +1410,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp_enc.c.fips openssl-1.0.0f/crypto/evp/evp_
if(key || (ctx->cipher->flags & EVP_CIPH_ALWAYS_CALL_INIT)) {
if(!ctx->cipher->init(ctx,key,iv,enc)) return 0;
}
-diff -up openssl-1.0.0f/crypto/evp/evp_err.c.fips openssl-1.0.0f/crypto/evp/evp_err.c
---- openssl-1.0.0f/crypto/evp/evp_err.c.fips 2010-02-07 14:41:23.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp_err.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_err.c.fips openssl-1.0.0k/crypto/evp/evp_err.c
+--- openssl-1.0.0k/crypto/evp/evp_err.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/evp_err.c 2013-02-19 20:12:54.585664685 +0100
@@ -155,6 +155,7 @@ static ERR_STRING_DATA EVP_str_reasons[]
{ERR_REASON(EVP_R_DECODE_ERROR) ,"decode error"},
{ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES) ,"different key types"},
@@ -1424,9 +1421,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp_err.c.fips openssl-1.0.0f/crypto/evp/evp_
{ERR_REASON(EVP_R_ENCODE_ERROR) ,"encode error"},
{ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"},
{ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY) ,"expecting an rsa key"},
-diff -up openssl-1.0.0f/crypto/evp/evp.h.fips openssl-1.0.0f/crypto/evp/evp.h
---- openssl-1.0.0f/crypto/evp/evp.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp.h.fips openssl-1.0.0k/crypto/evp/evp.h
+--- openssl-1.0.0k/crypto/evp/evp.h.fips 2013-02-19 20:12:54.344660112 +0100
++++ openssl-1.0.0k/crypto/evp/evp.h 2013-02-19 20:12:54.585664685 +0100
@@ -75,6 +75,10 @@
#include <openssl/bio.h>
#endif
@@ -1496,9 +1493,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp.h.fips openssl-1.0.0f/crypto/evp/evp.h
#define EVP_R_ENCODE_ERROR 115
#define EVP_R_EVP_PBE_CIPHERINIT_ERROR 119
#define EVP_R_EXPECTING_AN_RSA_KEY 127
-diff -up openssl-1.0.0f/crypto/evp/evp_lib.c.fips openssl-1.0.0f/crypto/evp/evp_lib.c
---- openssl-1.0.0f/crypto/evp/evp_lib.c.fips 2010-01-26 15:33:51.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp_lib.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_lib.c.fips openssl-1.0.0k/crypto/evp/evp_lib.c
+--- openssl-1.0.0k/crypto/evp/evp_lib.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/evp_lib.c 2013-02-19 20:12:54.586664704 +0100
@@ -67,6 +67,8 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_
if (c->cipher->set_asn1_parameters != NULL)
@@ -1527,9 +1524,9 @@ diff -up openssl-1.0.0f/crypto/evp/evp_lib.c.fips openssl-1.0.0f/crypto/evp/evp_
return ctx->cipher->do_cipher(ctx,out,in,inl);
}
-diff -up openssl-1.0.0f/crypto/evp/evp_locl.h.fips openssl-1.0.0f/crypto/evp/evp_locl.h
---- openssl-1.0.0f/crypto/evp/evp_locl.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/evp_locl.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/evp_locl.h.fips openssl-1.0.0k/crypto/evp/evp_locl.h
+--- openssl-1.0.0k/crypto/evp/evp_locl.h.fips 2013-02-19 20:12:54.335659942 +0100
++++ openssl-1.0.0k/crypto/evp/evp_locl.h 2013-02-19 20:12:54.586664704 +0100
@@ -254,14 +254,32 @@ const EVP_CIPHER *EVP_##cname##_ecb(void
#define EVP_C_DATA(kstruct, ctx) ((kstruct *)(ctx)->cipher_data)
@@ -1568,33 +1565,33 @@ diff -up openssl-1.0.0f/crypto/evp/evp_locl.h.fips openssl-1.0.0f/crypto/evp/evp
struct evp_pkey_ctx_st
{
-diff -up openssl-1.0.0f/crypto/evp/m_dss.c.fips openssl-1.0.0f/crypto/evp/m_dss.c
---- openssl-1.0.0f/crypto/evp/m_dss.c.fips 2006-04-19 19:05:57.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_dss.c 2012-01-05 13:22:30.000000000 +0100
-@@ -81,7 +81,7 @@ static const EVP_MD dsa_md=
- NID_dsaWithSHA,
- NID_dsaWithSHA,
+diff -up openssl-1.0.0k/crypto/evp/m_dss1.c.fips openssl-1.0.0k/crypto/evp/m_dss1.c
+--- openssl-1.0.0k/crypto/evp/m_dss1.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_dss1.c 2013-02-19 20:12:54.587664724 +0100
+@@ -82,7 +82,7 @@ static const EVP_MD dss1_md=
+ NID_dsa,
+ NID_dsaWithSHA1,
SHA_DIGEST_LENGTH,
- EVP_MD_FLAG_PKEY_DIGEST,
+ EVP_MD_FLAG_PKEY_DIGEST|EVP_MD_FLAG_FIPS,
init,
update,
final,
-diff -up openssl-1.0.0f/crypto/evp/m_dss1.c.fips openssl-1.0.0f/crypto/evp/m_dss1.c
---- openssl-1.0.0f/crypto/evp/m_dss1.c.fips 2006-04-19 19:05:57.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_dss1.c 2012-01-05 13:22:30.000000000 +0100
-@@ -82,7 +82,7 @@ static const EVP_MD dss1_md=
- NID_dsa,
- NID_dsaWithSHA1,
+diff -up openssl-1.0.0k/crypto/evp/m_dss.c.fips openssl-1.0.0k/crypto/evp/m_dss.c
+--- openssl-1.0.0k/crypto/evp/m_dss.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_dss.c 2013-02-19 20:12:54.587664724 +0100
+@@ -81,7 +81,7 @@ static const EVP_MD dsa_md=
+ NID_dsaWithSHA,
+ NID_dsaWithSHA,
SHA_DIGEST_LENGTH,
- EVP_MD_FLAG_PKEY_DIGEST,
+ EVP_MD_FLAG_PKEY_DIGEST|EVP_MD_FLAG_FIPS,
init,
update,
final,
-diff -up openssl-1.0.0f/crypto/evp/m_mdc2.c.fips openssl-1.0.0f/crypto/evp/m_mdc2.c
---- openssl-1.0.0f/crypto/evp/m_mdc2.c.fips 2010-02-02 14:36:05.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/m_mdc2.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/m_md2.c.fips openssl-1.0.0k/crypto/evp/m_md2.c
+--- openssl-1.0.0k/crypto/evp/m_md2.c.fips 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_md2.c 2013-02-19 20:12:54.587664724 +0100
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
@@ -1602,10 +1599,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_mdc2.c.fips openssl-1.0.0f/crypto/evp/m_mdc
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
- { return MDC2_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_md2.c.fips openssl-1.0.0f/crypto/evp/m_md2.c
---- openssl-1.0.0f/crypto/evp/m_md2.c.fips 2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_md2.c 2012-01-05 13:22:30.000000000 +0100
+ { return MD2_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_md4.c.fips openssl-1.0.0k/crypto/evp/m_md4.c
+--- openssl-1.0.0k/crypto/evp/m_md4.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_md4.c 2013-02-19 20:12:54.588664743 +0100
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
@@ -1613,10 +1610,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_md2.c.fips openssl-1.0.0f/crypto/evp/m_md2.
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
- { return MD2_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_md4.c.fips openssl-1.0.0f/crypto/evp/m_md4.c
---- openssl-1.0.0f/crypto/evp/m_md4.c.fips 2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_md4.c 2012-01-05 13:22:30.000000000 +0100
+ { return MD4_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_md5.c.fips openssl-1.0.0k/crypto/evp/m_md5.c
+--- openssl-1.0.0k/crypto/evp/m_md5.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_md5.c 2013-02-19 20:12:54.588664743 +0100
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
@@ -1624,10 +1621,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_md4.c.fips openssl-1.0.0f/crypto/evp/m_md4.
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
- { return MD4_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_md5.c.fips openssl-1.0.0f/crypto/evp/m_md5.c
---- openssl-1.0.0f/crypto/evp/m_md5.c.fips 2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_md5.c 2012-01-05 13:22:30.000000000 +0100
+ { return MD5_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_mdc2.c.fips openssl-1.0.0k/crypto/evp/m_mdc2.c
+--- openssl-1.0.0k/crypto/evp/m_mdc2.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_mdc2.c 2013-02-19 20:12:54.587664724 +0100
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
@@ -1635,10 +1632,10 @@ diff -up openssl-1.0.0f/crypto/evp/m_md5.c.fips openssl-1.0.0f/crypto/evp/m_md5.
+#include "evp_locl.h"
static int init(EVP_MD_CTX *ctx)
- { return MD5_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_ripemd.c.fips openssl-1.0.0f/crypto/evp/m_ripemd.c
---- openssl-1.0.0f/crypto/evp/m_ripemd.c.fips 2005-07-16 14:37:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/evp/m_ripemd.c 2012-01-05 13:22:30.000000000 +0100
+ { return MDC2_Init(ctx->md_data); }
+diff -up openssl-1.0.0k/crypto/evp/m_ripemd.c.fips openssl-1.0.0k/crypto/evp/m_ripemd.c
+--- openssl-1.0.0k/crypto/evp/m_ripemd.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_ripemd.c 2013-02-19 20:12:54.588664743 +0100
@@ -68,6 +68,7 @@
#ifndef OPENSSL_NO_RSA
#include <openssl/rsa.h>
@@ -1647,9 +1644,9 @@ diff -up openssl-1.0.0f/crypto/evp/m_ripemd.c.fips openssl-1.0.0f/crypto/evp/m_r
static int init(EVP_MD_CTX *ctx)
{ return RIPEMD160_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/m_sha1.c.fips openssl-1.0.0f/crypto/evp/m_sha1.c
---- openssl-1.0.0f/crypto/evp/m_sha1.c.fips 2008-03-12 22:14:24.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/m_sha1.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/m_sha1.c.fips openssl-1.0.0k/crypto/evp/m_sha1.c
+--- openssl-1.0.0k/crypto/evp/m_sha1.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_sha1.c 2013-02-19 20:12:54.589664762 +0100
@@ -82,7 +82,8 @@ static const EVP_MD sha1_md=
NID_sha1,
NID_sha1WithRSAEncryption,
@@ -1700,9 +1697,9 @@ diff -up openssl-1.0.0f/crypto/evp/m_sha1.c.fips openssl-1.0.0f/crypto/evp/m_sha
init512,
update512,
final512,
-diff -up openssl-1.0.0f/crypto/evp/m_wp.c.fips openssl-1.0.0f/crypto/evp/m_wp.c
---- openssl-1.0.0f/crypto/evp/m_wp.c.fips 2005-11-30 21:57:23.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/m_wp.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/m_wp.c.fips openssl-1.0.0k/crypto/evp/m_wp.c
+--- openssl-1.0.0k/crypto/evp/m_wp.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/m_wp.c 2013-02-19 20:12:54.589664762 +0100
@@ -9,6 +9,7 @@
#include <openssl/objects.h>
#include <openssl/x509.h>
@@ -1711,9 +1708,9 @@ diff -up openssl-1.0.0f/crypto/evp/m_wp.c.fips openssl-1.0.0f/crypto/evp/m_wp.c
static int init(EVP_MD_CTX *ctx)
{ return WHIRLPOOL_Init(ctx->md_data); }
-diff -up openssl-1.0.0f/crypto/evp/names.c.fips openssl-1.0.0f/crypto/evp/names.c
---- openssl-1.0.0f/crypto/evp/names.c.fips 2010-03-06 21:47:45.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/names.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/names.c.fips openssl-1.0.0k/crypto/evp/names.c
+--- openssl-1.0.0k/crypto/evp/names.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/names.c 2013-02-19 20:12:54.589664762 +0100
@@ -66,6 +66,10 @@ int EVP_add_cipher(const EVP_CIPHER *c)
{
int r;
@@ -1736,9 +1733,9 @@ diff -up openssl-1.0.0f/crypto/evp/names.c.fips openssl-1.0.0f/crypto/evp/names.
name=OBJ_nid2sn(md->type);
r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md);
if (r == 0) return(0);
-diff -up openssl-1.0.0f/crypto/evp/p_sign.c.fips openssl-1.0.0f/crypto/evp/p_sign.c
---- openssl-1.0.0f/crypto/evp/p_sign.c.fips 2010-11-27 18:34:57.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/p_sign.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/p_sign.c.fips openssl-1.0.0k/crypto/evp/p_sign.c
+--- openssl-1.0.0k/crypto/evp/p_sign.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/p_sign.c 2013-02-19 20:12:54.589664762 +0100
@@ -61,6 +61,7 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
@@ -1770,9 +1767,9 @@ diff -up openssl-1.0.0f/crypto/evp/p_sign.c.fips openssl-1.0.0f/crypto/evp/p_sig
if (EVP_PKEY_sign(pkctx, sigret, &sltmp, m, m_len) <= 0)
goto err;
*siglen = sltmp;
-diff -up openssl-1.0.0f/crypto/evp/p_verify.c.fips openssl-1.0.0f/crypto/evp/p_verify.c
---- openssl-1.0.0f/crypto/evp/p_verify.c.fips 2010-11-27 18:34:57.000000000 +0100
-+++ openssl-1.0.0f/crypto/evp/p_verify.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/evp/p_verify.c.fips openssl-1.0.0k/crypto/evp/p_verify.c
+--- openssl-1.0.0k/crypto/evp/p_verify.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/evp/p_verify.c 2013-02-19 20:12:54.590664781 +0100
@@ -61,6 +61,7 @@
#include <openssl/evp.h>
#include <openssl/objects.h>
@@ -1804,9 +1801,9 @@ diff -up openssl-1.0.0f/crypto/evp/p_verify.c.fips openssl-1.0.0f/crypto/evp/p_v
i = EVP_PKEY_verify(pkctx, sigbuf, siglen, m, m_len);
err:
EVP_PKEY_CTX_free(pkctx);
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c.fips 2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_aesavs.c 2013-02-19 20:12:54.591664800 +0100
@@ -0,0 +1,939 @@
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
@@ -2747,9 +2744,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_aesavs.c.fips openssl-1.0.0f/crypt
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c.fips 2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_desmovs.c 2013-02-19 20:12:54.591664800 +0100
@@ -0,0 +1,702 @@
+/* ====================================================================
+ * Copyright (c) 2004 The OpenSSL Project. All rights reserved.
@@ -3453,9 +3450,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_desmovs.c.fips openssl-1.0.0f/cryp
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c.fips 2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_dssvs.c 2013-02-19 20:12:54.591664800 +0100
@@ -0,0 +1,537 @@
+#include <openssl/opensslconf.h>
+
@@ -3994,9 +3991,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_dssvs.c.fips openssl-1.0.0f/crypto
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c.fips 2013-02-19 20:12:54.591664800 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rngvs.c 2013-02-19 20:12:54.591664800 +0100
@@ -0,0 +1,230 @@
+/*
+ * Crude test driver for processing the VST and MCT testvector files
@@ -4228,9 +4225,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rngvs.c.fips openssl-1.0.0f/crypto
+ return 0;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c.fips 2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rsagtest.c 2013-02-19 20:12:54.592664819 +0100
@@ -0,0 +1,390 @@
+/* fips_rsagtest.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -4622,9 +4619,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsagtest.c.fips openssl-1.0.0f/cry
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c.fips 2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rsastest.c 2013-02-19 20:12:54.592664819 +0100
@@ -0,0 +1,370 @@
+/* fips_rsastest.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -4996,9 +4993,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsastest.c.fips openssl-1.0.0f/cry
+ return ret;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c.fips 2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_rsavtest.c 2013-02-19 20:12:54.592664819 +0100
@@ -0,0 +1,377 @@
+/* fips_rsavtest.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -5377,9 +5374,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_rsavtest.c.fips openssl-1.0.0f/cry
+ return ret;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c.fips openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c
---- openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c.fips openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c
+--- openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c.fips 2013-02-19 20:12:54.592664819 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_shatest.c 2013-02-19 20:12:54.592664819 +0100
@@ -0,0 +1,388 @@
+/* fips_shatest.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -5769,9 +5766,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_shatest.c.fips openssl-1.0.0f/cryp
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/cavs/fips_utl.h.fips openssl-1.0.0f/crypto/fips/cavs/fips_utl.h
---- openssl-1.0.0f/crypto/fips/cavs/fips_utl.h.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/cavs/fips_utl.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/cavs/fips_utl.h.fips openssl-1.0.0k/crypto/fips/cavs/fips_utl.h
+--- openssl-1.0.0k/crypto/fips/cavs/fips_utl.h.fips 2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips/cavs/fips_utl.h 2013-02-19 20:12:54.593664838 +0100
@@ -0,0 +1,343 @@
+/* ====================================================================
+ * Copyright (c) 2007 The OpenSSL Project. All rights reserved.
@@ -6116,9 +6113,9 @@ diff -up openssl-1.0.0f/crypto/fips/cavs/fips_utl.h.fips openssl-1.0.0f/crypto/f
+#endif
+ }
+
-diff -up openssl-1.0.0f/crypto/fips_err.c.fips openssl-1.0.0f/crypto/fips_err.c
---- openssl-1.0.0f/crypto/fips_err.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips_err.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips_err.c.fips openssl-1.0.0k/crypto/fips_err.c
+--- openssl-1.0.0k/crypto/fips_err.c.fips 2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips_err.c 2013-02-19 20:12:54.593664838 +0100
@@ -0,0 +1,7 @@
+#include <openssl/opensslconf.h>
+
@@ -6127,9 +6124,9 @@ diff -up openssl-1.0.0f/crypto/fips_err.c.fips openssl-1.0.0f/crypto/fips_err.c
+#else
+static void *dummy=&dummy;
+#endif
-diff -up openssl-1.0.0f/crypto/fips_err.h.fips openssl-1.0.0f/crypto/fips_err.h
---- openssl-1.0.0f/crypto/fips_err.h.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips_err.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips_err.h.fips openssl-1.0.0k/crypto/fips_err.h
+--- openssl-1.0.0k/crypto/fips_err.h.fips 2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips_err.h 2013-02-19 20:12:54.593664838 +0100
@@ -0,0 +1,137 @@
+/* crypto/fips_err.h */
+/* ====================================================================
@@ -6268,9 +6265,9 @@ diff -up openssl-1.0.0f/crypto/fips_err.h.fips openssl-1.0.0f/crypto/fips_err.h
+ }
+#endif
+ }
-diff -up openssl-1.0.0f/crypto/fips/fips_aes_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_aes_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_aes_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_aes_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_aes_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_aes_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_aes_selftest.c.fips 2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips/fips_aes_selftest.c 2013-02-19 20:12:54.593664838 +0100
@@ -0,0 +1,103 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -6375,9 +6372,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_aes_selftest.c.fips openssl-1.0.0f/cryp
+ return ret;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips.c.fips openssl-1.0.0f/crypto/fips/fips.c
---- openssl-1.0.0f/crypto/fips/fips.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips.c.fips openssl-1.0.0k/crypto/fips/fips.c
+--- openssl-1.0.0k/crypto/fips/fips.c.fips 2013-02-19 20:12:54.593664838 +0100
++++ openssl-1.0.0k/crypto/fips/fips.c 2013-02-19 20:12:54.593664838 +0100
@@ -0,0 +1,419 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -6798,9 +6795,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips.c.fips openssl-1.0.0f/crypto/fips/fips.
+
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_des_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_des_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_des_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_des_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_des_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_des_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_des_selftest.c.fips 2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_des_selftest.c 2013-02-19 20:12:54.594664857 +0100
@@ -0,0 +1,139 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -6941,9 +6938,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_des_selftest.c.fips openssl-1.0.0f/cryp
+ return ret;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c.fips 2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_dsa_selftest.c 2013-02-19 20:12:54.594664857 +0100
@@ -0,0 +1,186 @@
+/* crypto/dsa/dsatest.c */
+/* Copyright (C) 1995-1998 Eric Young (eay at cryptsoft.com)
@@ -7131,9 +7128,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_dsa_selftest.c.fips openssl-1.0.0f/cryp
+ return ret;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips.h.fips openssl-1.0.0f/crypto/fips/fips.h
---- openssl-1.0.0f/crypto/fips/fips.h.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips.h.fips openssl-1.0.0k/crypto/fips/fips.h
+--- openssl-1.0.0k/crypto/fips/fips.h.fips 2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips.h 2013-02-19 20:12:54.594664857 +0100
@@ -0,0 +1,163 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -7298,9 +7295,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips.h.fips openssl-1.0.0f/crypto/fips/fips.
+}
+#endif
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c.fips 2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_hmac_selftest.c 2013-02-19 20:12:54.594664857 +0100
@@ -0,0 +1,137 @@
+/* ====================================================================
+ * Copyright (c) 2005 The OpenSSL Project. All rights reserved.
@@ -7439,9 +7436,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_hmac_selftest.c.fips openssl-1.0.0f/cry
+ return 1;
+ }
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rand.c.fips openssl-1.0.0f/crypto/fips/fips_rand.c
---- openssl-1.0.0f/crypto/fips/fips_rand.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rand.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rand.c.fips openssl-1.0.0k/crypto/fips/fips_rand.c
+--- openssl-1.0.0k/crypto/fips/fips_rand.c.fips 2013-02-19 20:12:54.594664857 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rand.c 2013-02-19 20:12:54.594664857 +0100
@@ -0,0 +1,412 @@
+/* ====================================================================
+ * Copyright (c) 2007 The OpenSSL Project. All rights reserved.
@@ -7855,9 +7852,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rand.c.fips openssl-1.0.0f/crypto/fips/
+}
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rand.h.fips openssl-1.0.0f/crypto/fips/fips_rand.h
---- openssl-1.0.0f/crypto/fips/fips_rand.h.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rand.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rand.h.fips openssl-1.0.0k/crypto/fips/fips_rand.h
+--- openssl-1.0.0k/crypto/fips/fips_rand.h.fips 2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rand.h 2013-02-19 20:12:54.595664876 +0100
@@ -0,0 +1,77 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -7936,9 +7933,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rand.h.fips openssl-1.0.0f/crypto/fips/
+#endif
+#endif
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rand_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_rand_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_rand_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rand_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rand_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_rand_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_rand_selftest.c.fips 2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rand_selftest.c 2013-02-19 20:12:54.595664876 +0100
@@ -0,0 +1,373 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -8313,9 +8310,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rand_selftest.c.fips openssl-1.0.0f/cry
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_randtest.c.fips openssl-1.0.0f/crypto/fips/fips_randtest.c
---- openssl-1.0.0f/crypto/fips/fips_randtest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_randtest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_randtest.c.fips openssl-1.0.0k/crypto/fips/fips_randtest.c
+--- openssl-1.0.0k/crypto/fips/fips_randtest.c.fips 2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_randtest.c 2013-02-19 20:12:54.595664876 +0100
@@ -0,0 +1,248 @@
+/* Copyright (C) 1995-1998 Eric Young (eay at cryptsoft.com)
+ * All rights reserved.
@@ -8565,9 +8562,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_randtest.c.fips openssl-1.0.0f/crypto/f
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c.fips 2013-02-19 20:12:54.595664876 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rsa_selftest.c 2013-02-19 20:12:54.595664876 +0100
@@ -0,0 +1,441 @@
+/* ====================================================================
+ * Copyright (c) 2003-2007 The OpenSSL Project. All rights reserved.
@@ -9010,9 +9007,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rsa_selftest.c.fips openssl-1.0.0f/cryp
+ }
+
+#endif /* def OPENSSL_FIPS */
-diff -up openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c.fips openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c
---- openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c.fips openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c
+--- openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c.fips 2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_rsa_x931g.c 2013-02-19 20:12:54.596664895 +0100
@@ -0,0 +1,281 @@
+/* crypto/rsa/rsa_gen.c */
+/* Copyright (C) 1995-1998 Eric Young (eay at cryptsoft.com)
@@ -9295,9 +9292,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_rsa_x931g.c.fips openssl-1.0.0f/crypto/
+ return 0;
+
+ }
-diff -up openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c.fips openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c
---- openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c.fips openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c
+--- openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c.fips 2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_sha1_selftest.c 2013-02-19 20:12:54.596664895 +0100
@@ -0,0 +1,99 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -9398,9 +9395,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_sha1_selftest.c.fips openssl-1.0.0f/cry
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c.fips openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c
---- openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.fips openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c
+--- openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.fips 2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c 2013-02-19 20:12:54.596664895 +0100
@@ -0,0 +1,173 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -9575,9 +9572,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_standalone_sha1.c.fips openssl-1.0.0f/c
+ }
+
+
-diff -up openssl-1.0.0f/crypto/fips/fips_test_suite.c.fips openssl-1.0.0f/crypto/fips/fips_test_suite.c
---- openssl-1.0.0f/crypto/fips/fips_test_suite.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/fips_test_suite.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/fips_test_suite.c.fips openssl-1.0.0k/crypto/fips/fips_test_suite.c
+--- openssl-1.0.0k/crypto/fips/fips_test_suite.c.fips 2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips/fips_test_suite.c 2013-02-19 20:12:54.596664895 +0100
@@ -0,0 +1,588 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -10167,9 +10164,9 @@ diff -up openssl-1.0.0f/crypto/fips/fips_test_suite.c.fips openssl-1.0.0f/crypto
+ }
+
+#endif
-diff -up openssl-1.0.0f/crypto/fips_locl.h.fips openssl-1.0.0f/crypto/fips_locl.h
---- openssl-1.0.0f/crypto/fips_locl.h.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips_locl.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips_locl.h.fips openssl-1.0.0k/crypto/fips_locl.h
+--- openssl-1.0.0k/crypto/fips_locl.h.fips 2013-02-19 20:12:54.596664895 +0100
++++ openssl-1.0.0k/crypto/fips_locl.h 2013-02-19 20:12:54.596664895 +0100
@@ -0,0 +1,72 @@
+/* ====================================================================
+ * Copyright (c) 2003 The OpenSSL Project. All rights reserved.
@@ -10243,9 +10240,9 @@ diff -up openssl-1.0.0f/crypto/fips_locl.h.fips openssl-1.0.0f/crypto/fips_locl.
+}
+#endif
+#endif
-diff -up openssl-1.0.0f/crypto/fips/Makefile.fips openssl-1.0.0f/crypto/fips/Makefile
---- openssl-1.0.0f/crypto/fips/Makefile.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/fips/Makefile 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/fips/Makefile.fips openssl-1.0.0k/crypto/fips/Makefile
+--- openssl-1.0.0k/crypto/fips/Makefile.fips 2013-02-19 20:12:54.597664913 +0100
++++ openssl-1.0.0k/crypto/fips/Makefile 2013-02-19 20:12:54.597664913 +0100
@@ -0,0 +1,81 @@
+#
+# OpenSSL/crypto/fips/Makefile
@@ -10328,9 +10325,9 @@ diff -up openssl-1.0.0f/crypto/fips/Makefile.fips openssl-1.0.0f/crypto/fips/Mak
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
-diff -up openssl-1.0.0f/crypto/hmac/hmac.c.fips openssl-1.0.0f/crypto/hmac/hmac.c
---- openssl-1.0.0f/crypto/hmac/hmac.c.fips 2010-06-15 19:25:09.000000000 +0200
-+++ openssl-1.0.0f/crypto/hmac/hmac.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/hmac/hmac.c.fips openssl-1.0.0k/crypto/hmac/hmac.c
+--- openssl-1.0.0k/crypto/hmac/hmac.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/hmac/hmac.c 2013-02-19 20:12:54.597664913 +0100
@@ -77,6 +77,13 @@ int HMAC_Init_ex(HMAC_CTX *ctx, const vo
if (key != NULL)
@@ -10345,9 +10342,9 @@ diff -up openssl-1.0.0f/crypto/hmac/hmac.c.fips openssl-1.0.0f/crypto/hmac/hmac.
reset=1;
j=EVP_MD_block_size(md);
OPENSSL_assert(j <= (int)sizeof(ctx->key));
-diff -up openssl-1.0.0f/crypto/Makefile.fips openssl-1.0.0f/crypto/Makefile
---- openssl-1.0.0f/crypto/Makefile.fips 2010-07-27 00:09:59.000000000 +0200
-+++ openssl-1.0.0f/crypto/Makefile 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/Makefile.fips openssl-1.0.0k/crypto/Makefile
+--- openssl-1.0.0k/crypto/Makefile.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/Makefile 2013-02-19 20:12:54.597664913 +0100
@@ -34,14 +34,14 @@ GENERAL=Makefile README crypto-lib.com i
LIB= $(TOP)/libcrypto.a
@@ -10366,47 +10363,9 @@ diff -up openssl-1.0.0f/crypto/Makefile.fips openssl-1.0.0f/crypto/Makefile
ALL= $(GENERAL) $(SRC) $(HEADER)
-diff -up openssl-1.0.0f/crypto/mdc2/mdc2dgst.c.fips openssl-1.0.0f/crypto/mdc2/mdc2dgst.c
---- openssl-1.0.0f/crypto/mdc2/mdc2dgst.c.fips 2004-07-25 21:10:41.000000000 +0200
-+++ openssl-1.0.0f/crypto/mdc2/mdc2dgst.c 2012-01-05 13:22:30.000000000 +0100
-@@ -61,6 +61,11 @@
- #include <string.h>
- #include <openssl/des.h>
- #include <openssl/mdc2.h>
-+#include <openssl/err.h>
-+#ifdef OPENSSL_FIPS
-+#include <openssl/fips.h>
-+#endif
-+
-
- #undef c2l
- #define c2l(c,l) (l =((DES_LONG)(*((c)++))) , \
-@@ -75,7 +80,7 @@
- *((c)++)=(unsigned char)(((l)>>24L)&0xff))
-
- static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
--int MDC2_Init(MDC2_CTX *c)
-+FIPS_NON_FIPS_MD_Init(MDC2)
- {
- c->num=0;
- c->pad_type=1;
-diff -up openssl-1.0.0f/crypto/mdc2/mdc2.h.fips openssl-1.0.0f/crypto/mdc2/mdc2.h
---- openssl-1.0.0f/crypto/mdc2/mdc2.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/mdc2/mdc2.h 2012-01-05 13:22:30.000000000 +0100
-@@ -80,7 +80,9 @@ typedef struct mdc2_ctx_st
- int pad_type; /* either 1 or 2, default 1 */
- } MDC2_CTX;
-
--
-+#ifdef OPENSSL_FIPS
-+int private_MDC2_Init(MDC2_CTX *c);
-+#endif
- int MDC2_Init(MDC2_CTX *c);
- int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
- int MDC2_Final(unsigned char *md, MDC2_CTX *c);
-diff -up openssl-1.0.0f/crypto/md2/md2_dgst.c.fips openssl-1.0.0f/crypto/md2/md2_dgst.c
---- openssl-1.0.0f/crypto/md2/md2_dgst.c.fips 2007-08-31 12:12:35.000000000 +0200
-+++ openssl-1.0.0f/crypto/md2/md2_dgst.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md2/md2_dgst.c.fips openssl-1.0.0k/crypto/md2/md2_dgst.c
+--- openssl-1.0.0k/crypto/md2/md2_dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/md2/md2_dgst.c 2013-02-19 20:12:54.597664913 +0100
@@ -62,6 +62,11 @@
#include <openssl/md2.h>
#include <openssl/opensslv.h>
@@ -10428,9 +10387,9 @@ diff -up openssl-1.0.0f/crypto/md2/md2_dgst.c.fips openssl-1.0.0f/crypto/md2/md2
{
c->num=0;
memset(c->state,0,sizeof c->state);
-diff -up openssl-1.0.0f/crypto/md2/md2.h.fips openssl-1.0.0f/crypto/md2/md2.h
---- openssl-1.0.0f/crypto/md2/md2.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/md2/md2.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md2/md2.h.fips openssl-1.0.0k/crypto/md2/md2.h
+--- openssl-1.0.0k/crypto/md2/md2.h.fips 2013-02-19 20:12:54.348660189 +0100
++++ openssl-1.0.0k/crypto/md2/md2.h 2013-02-19 20:12:54.597664913 +0100
@@ -81,6 +81,9 @@ typedef struct MD2state_st
} MD2_CTX;
@@ -10441,9 +10400,9 @@ diff -up openssl-1.0.0f/crypto/md2/md2.h.fips openssl-1.0.0f/crypto/md2/md2.h
int MD2_Init(MD2_CTX *c);
int MD2_Update(MD2_CTX *c, const unsigned char *data, size_t len);
int MD2_Final(unsigned char *md, MD2_CTX *c);
-diff -up openssl-1.0.0f/crypto/md4/md4_dgst.c.fips openssl-1.0.0f/crypto/md4/md4_dgst.c
---- openssl-1.0.0f/crypto/md4/md4_dgst.c.fips 2007-01-21 14:07:11.000000000 +0100
-+++ openssl-1.0.0f/crypto/md4/md4_dgst.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md4/md4_dgst.c.fips openssl-1.0.0k/crypto/md4/md4_dgst.c
+--- openssl-1.0.0k/crypto/md4/md4_dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/md4/md4_dgst.c 2013-02-19 20:12:54.598664931 +0100
@@ -59,6 +59,11 @@
#include <stdio.h>
#include "md4_locl.h"
@@ -10465,9 +10424,9 @@ diff -up openssl-1.0.0f/crypto/md4/md4_dgst.c.fips openssl-1.0.0f/crypto/md4/md4
{
memset (c,0,sizeof(*c));
c->A=INIT_DATA_A;
-diff -up openssl-1.0.0f/crypto/md4/md4.h.fips openssl-1.0.0f/crypto/md4/md4.h
---- openssl-1.0.0f/crypto/md4/md4.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/md4/md4.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md4/md4.h.fips openssl-1.0.0k/crypto/md4/md4.h
+--- openssl-1.0.0k/crypto/md4/md4.h.fips 2013-02-19 20:12:54.268658671 +0100
++++ openssl-1.0.0k/crypto/md4/md4.h 2013-02-19 20:12:54.598664931 +0100
@@ -105,6 +105,9 @@ typedef struct MD4state_st
unsigned int num;
} MD4_CTX;
@@ -10478,9 +10437,9 @@ diff -up openssl-1.0.0f/crypto/md4/md4.h.fips openssl-1.0.0f/crypto/md4/md4.h
int MD4_Init(MD4_CTX *c);
int MD4_Update(MD4_CTX *c, const void *data, size_t len);
int MD4_Final(unsigned char *md, MD4_CTX *c);
-diff -up openssl-1.0.0f/crypto/md5/md5_dgst.c.fips openssl-1.0.0f/crypto/md5/md5_dgst.c
---- openssl-1.0.0f/crypto/md5/md5_dgst.c.fips 2007-01-21 14:07:11.000000000 +0100
-+++ openssl-1.0.0f/crypto/md5/md5_dgst.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md5/md5_dgst.c.fips openssl-1.0.0k/crypto/md5/md5_dgst.c
+--- openssl-1.0.0k/crypto/md5/md5_dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/md5/md5_dgst.c 2013-02-19 20:12:54.598664931 +0100
@@ -59,6 +59,11 @@
#include <stdio.h>
#include "md5_locl.h"
@@ -10502,9 +10461,9 @@ diff -up openssl-1.0.0f/crypto/md5/md5_dgst.c.fips openssl-1.0.0f/crypto/md5/md5
{
memset (c,0,sizeof(*c));
c->A=INIT_DATA_A;
-diff -up openssl-1.0.0f/crypto/md5/md5.h.fips openssl-1.0.0f/crypto/md5/md5.h
---- openssl-1.0.0f/crypto/md5/md5.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/md5/md5.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/md5/md5.h.fips openssl-1.0.0k/crypto/md5/md5.h
+--- openssl-1.0.0k/crypto/md5/md5.h.fips 2013-02-19 20:12:54.012653813 +0100
++++ openssl-1.0.0k/crypto/md5/md5.h 2013-02-19 20:12:54.598664931 +0100
@@ -105,6 +105,9 @@ typedef struct MD5state_st
unsigned int num;
} MD5_CTX;
@@ -10515,10 +10474,48 @@ diff -up openssl-1.0.0f/crypto/md5/md5.h.fips openssl-1.0.0f/crypto/md5/md5.h
int MD5_Init(MD5_CTX *c);
int MD5_Update(MD5_CTX *c, const void *data, size_t len);
int MD5_Final(unsigned char *md, MD5_CTX *c);
-diff -up openssl-1.0.0f/crypto/mem.c.fips openssl-1.0.0f/crypto/mem.c
---- openssl-1.0.0f/crypto/mem.c.fips 2008-11-12 04:57:47.000000000 +0100
-+++ openssl-1.0.0f/crypto/mem.c 2012-01-05 13:22:30.000000000 +0100
-@@ -101,7 +101,7 @@ static void (*free_locked_func)(void *)
+diff -up openssl-1.0.0k/crypto/mdc2/mdc2dgst.c.fips openssl-1.0.0k/crypto/mdc2/mdc2dgst.c
+--- openssl-1.0.0k/crypto/mdc2/mdc2dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/mdc2/mdc2dgst.c 2013-02-19 20:12:54.597664913 +0100
+@@ -61,6 +61,11 @@
+ #include <string.h>
+ #include <openssl/des.h>
+ #include <openssl/mdc2.h>
++#include <openssl/err.h>
++#ifdef OPENSSL_FIPS
++#include <openssl/fips.h>
++#endif
++
+
+ #undef c2l
+ #define c2l(c,l) (l =((DES_LONG)(*((c)++))) , \
+@@ -75,7 +80,7 @@
+ *((c)++)=(unsigned char)(((l)>>24L)&0xff))
+
+ static void mdc2_body(MDC2_CTX *c, const unsigned char *in, size_t len);
+-int MDC2_Init(MDC2_CTX *c)
++FIPS_NON_FIPS_MD_Init(MDC2)
+ {
+ c->num=0;
+ c->pad_type=1;
+diff -up openssl-1.0.0k/crypto/mdc2/mdc2.h.fips openssl-1.0.0k/crypto/mdc2/mdc2.h
+--- openssl-1.0.0k/crypto/mdc2/mdc2.h.fips 2013-02-19 20:12:54.061654741 +0100
++++ openssl-1.0.0k/crypto/mdc2/mdc2.h 2013-02-19 20:12:54.597664913 +0100
+@@ -80,7 +80,9 @@ typedef struct mdc2_ctx_st
+ int pad_type; /* either 1 or 2, default 1 */
+ } MDC2_CTX;
+
+-
++#ifdef OPENSSL_FIPS
++int private_MDC2_Init(MDC2_CTX *c);
++#endif
+ int MDC2_Init(MDC2_CTX *c);
+ int MDC2_Update(MDC2_CTX *c, const unsigned char *data, size_t len);
+ int MDC2_Final(unsigned char *md, MDC2_CTX *c);
+diff -up openssl-1.0.0k/crypto/mem.c.fips openssl-1.0.0k/crypto/mem.c
+--- openssl-1.0.0k/crypto/mem.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/mem.c 2013-02-19 20:12:54.598664931 +0100
+@@ -101,7 +101,7 @@ static void (*free_locked_func)(void *)
/* may be changed as long as 'allow_customize_debug' is set */
/* XXX use correct function pointer types */
@@ -10527,9 +10524,9 @@ diff -up openssl-1.0.0f/crypto/mem.c.fips openssl-1.0.0f/crypto/mem.c
/* use default functions from mem_dbg.c */
static void (*malloc_debug_func)(void *,int,const char *,int,int)
= CRYPTO_dbg_malloc;
-diff -up openssl-1.0.0f/crypto/o_init.c.fips openssl-1.0.0f/crypto/o_init.c
---- openssl-1.0.0f/crypto/o_init.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/o_init.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/o_init.c.fips openssl-1.0.0k/crypto/o_init.c
+--- openssl-1.0.0k/crypto/o_init.c.fips 2013-02-19 20:12:54.598664931 +0100
++++ openssl-1.0.0k/crypto/o_init.c 2013-02-19 20:12:54.598664931 +0100
@@ -0,0 +1,80 @@
+/* o_init.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -10611,9 +10608,9 @@ diff -up openssl-1.0.0f/crypto/o_init.c.fips openssl-1.0.0f/crypto/o_init.c
+ }
+
+
-diff -up openssl-1.0.0f/crypto/opensslconf.h.in.fips openssl-1.0.0f/crypto/opensslconf.h.in
---- openssl-1.0.0f/crypto/opensslconf.h.in.fips 2005-12-16 11:37:23.000000000 +0100
-+++ openssl-1.0.0f/crypto/opensslconf.h.in 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/opensslconf.h.in.fips openssl-1.0.0k/crypto/opensslconf.h.in
+--- openssl-1.0.0k/crypto/opensslconf.h.in.fips 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/opensslconf.h.in 2013-02-19 20:12:54.599664950 +0100
@@ -1,5 +1,20 @@
/* crypto/opensslconf.h.in */
@@ -10635,9 +10632,9 @@ diff -up openssl-1.0.0f/crypto/opensslconf.h.in.fips openssl-1.0.0f/crypto/opens
/* Generate 80386 code? */
#undef I386_ONLY
-diff -up openssl-1.0.0f/crypto/pkcs12/p12_crt.c.fips openssl-1.0.0f/crypto/pkcs12/p12_crt.c
---- openssl-1.0.0f/crypto/pkcs12/p12_crt.c.fips 2009-03-09 14:08:04.000000000 +0100
-+++ openssl-1.0.0f/crypto/pkcs12/p12_crt.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/pkcs12/p12_crt.c.fips openssl-1.0.0k/crypto/pkcs12/p12_crt.c
+--- openssl-1.0.0k/crypto/pkcs12/p12_crt.c.fips 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/pkcs12/p12_crt.c 2013-02-19 20:12:54.599664950 +0100
@@ -59,6 +59,10 @@
#include <stdio.h>
#include "cryptlib.h"
@@ -10664,9 +10661,9 @@ diff -up openssl-1.0.0f/crypto/pkcs12/p12_crt.c.fips openssl-1.0.0f/crypto/pkcs1
if (!nid_key)
nid_key = NID_pbe_WithSHA1And3_Key_TripleDES_CBC;
if (!iter)
-diff -up openssl-1.0.0f/crypto/rand/md_rand.c.fips openssl-1.0.0f/crypto/rand/md_rand.c
---- openssl-1.0.0f/crypto/rand/md_rand.c.fips 2010-06-16 15:17:22.000000000 +0200
-+++ openssl-1.0.0f/crypto/rand/md_rand.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/md_rand.c.fips openssl-1.0.0k/crypto/rand/md_rand.c
+--- openssl-1.0.0k/crypto/rand/md_rand.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/md_rand.c 2013-02-19 20:12:54.599664950 +0100
@@ -126,6 +126,10 @@
#include <openssl/crypto.h>
@@ -10693,9 +10690,9 @@ diff -up openssl-1.0.0f/crypto/rand/md_rand.c.fips openssl-1.0.0f/crypto/rand/md
#ifdef PREDICT
if (rand_predictable)
{
-diff -up openssl-1.0.0f/crypto/rand/rand_err.c.fips openssl-1.0.0f/crypto/rand/rand_err.c
---- openssl-1.0.0f/crypto/rand/rand_err.c.fips 2006-11-21 22:29:41.000000000 +0100
-+++ openssl-1.0.0f/crypto/rand/rand_err.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/rand_err.c.fips openssl-1.0.0k/crypto/rand/rand_err.c
+--- openssl-1.0.0k/crypto/rand/rand_err.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/rand_err.c 2013-02-19 20:12:54.599664950 +0100
@@ -70,6 +70,13 @@
static ERR_STRING_DATA RAND_str_functs[]=
@@ -10728,9 +10725,9 @@ diff -up openssl-1.0.0f/crypto/rand/rand_err.c.fips openssl-1.0.0f/crypto/rand/r
{0,NULL}
};
-diff -up openssl-1.0.0f/crypto/rand/rand.h.fips openssl-1.0.0f/crypto/rand/rand.h
---- openssl-1.0.0f/crypto/rand/rand.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rand/rand.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/rand.h.fips openssl-1.0.0k/crypto/rand/rand.h
+--- openssl-1.0.0k/crypto/rand/rand.h.fips 2013-02-19 20:12:54.071654932 +0100
++++ openssl-1.0.0k/crypto/rand/rand.h 2013-02-19 20:12:54.599664950 +0100
@@ -128,11 +128,28 @@ void ERR_load_RAND_strings(void);
/* Error codes for the RAND functions. */
@@ -10760,9 +10757,9 @@ diff -up openssl-1.0.0f/crypto/rand/rand.h.fips openssl-1.0.0f/crypto/rand/rand.
#ifdef __cplusplus
}
-diff -up openssl-1.0.0f/crypto/rand/rand_lib.c.fips openssl-1.0.0f/crypto/rand/rand_lib.c
---- openssl-1.0.0f/crypto/rand/rand_lib.c.fips 2008-11-12 04:58:04.000000000 +0100
-+++ openssl-1.0.0f/crypto/rand/rand_lib.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rand/rand_lib.c.fips openssl-1.0.0k/crypto/rand/rand_lib.c
+--- openssl-1.0.0k/crypto/rand/rand_lib.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/rand_lib.c 2013-02-19 20:12:54.599664950 +0100
@@ -60,6 +60,12 @@
#include <time.h>
#include "cryptlib.h"
@@ -10796,9 +10793,9 @@ diff -up openssl-1.0.0f/crypto/rand/rand_lib.c.fips openssl-1.0.0f/crypto/rand/r
return default_RAND_meth;
}
-diff -up openssl-1.0.0f/crypto/rc2/rc2.h.fips openssl-1.0.0f/crypto/rc2/rc2.h
---- openssl-1.0.0f/crypto/rc2/rc2.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc2/rc2.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc2/rc2.h.fips openssl-1.0.0k/crypto/rc2/rc2.h
+--- openssl-1.0.0k/crypto/rc2/rc2.h.fips 2013-02-19 20:12:54.216657683 +0100
++++ openssl-1.0.0k/crypto/rc2/rc2.h 2013-02-19 20:12:54.599664950 +0100
@@ -79,7 +79,9 @@ typedef struct rc2_key_st
RC2_INT data[64];
} RC2_KEY;
@@ -10810,9 +10807,9 @@ diff -up openssl-1.0.0f/crypto/rc2/rc2.h.fips openssl-1.0.0f/crypto/rc2/rc2.h
void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
int enc);
-diff -up openssl-1.0.0f/crypto/rc2/rc2_skey.c.fips openssl-1.0.0f/crypto/rc2/rc2_skey.c
---- openssl-1.0.0f/crypto/rc2/rc2_skey.c.fips 2007-09-18 23:10:32.000000000 +0200
-+++ openssl-1.0.0f/crypto/rc2/rc2_skey.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc2/rc2_skey.c.fips openssl-1.0.0k/crypto/rc2/rc2_skey.c
+--- openssl-1.0.0k/crypto/rc2/rc2_skey.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc2/rc2_skey.c 2013-02-19 20:12:54.600664970 +0100
@@ -57,6 +57,11 @@
*/
@@ -10846,31 +10843,9 @@ diff -up openssl-1.0.0f/crypto/rc2/rc2_skey.c.fips openssl-1.0.0f/crypto/rc2/rc2
int i,j;
unsigned char *k;
RC2_INT *ki;
-diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl.fips openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl
---- openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl.fips 2009-02-12 15:48:49.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/asm/rc4-s390x.pl 2012-01-05 13:22:30.000000000 +0100
-@@ -202,4 +202,6 @@ RC4_options:
- .string "rc4(8x,char)"
- ___
-
-+$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
-+
- print $code;
-diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl.fips openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl
---- openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl.fips 2009-04-27 21:31:04.000000000 +0200
-+++ openssl-1.0.0f/crypto/rc4/asm/rc4-x86_64.pl 2012-01-05 13:22:30.000000000 +0100
-@@ -499,6 +499,8 @@ ___
-
- $code =~ s/#([bwd])/$1/gm;
-
-+$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
-+
- print $code;
-
- close STDOUT;
-diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl.fips openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl
---- openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl.fips 2007-12-02 22:32:03.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.fips openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl 2013-02-19 20:12:54.600664970 +0100
@@ -166,8 +166,12 @@ $idx="edx";
&external_label("OPENSSL_ia32cap_P");
@@ -10894,9 +10869,31 @@ diff -up openssl-1.0.0f/crypto/rc4/asm/rc4-586.pl.fips openssl-1.0.0f/crypto/rc4
# const char *RC4_options(void);
&function_begin_B("RC4_options");
-diff -up openssl-1.0.0f/crypto/rc4/Makefile.fips openssl-1.0.0f/crypto/rc4/Makefile
---- openssl-1.0.0f/crypto/rc4/Makefile.fips 2009-02-11 11:01:36.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/Makefile 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl.fips openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-s390x.pl 2013-02-19 20:12:54.600664970 +0100
+@@ -202,4 +202,6 @@ RC4_options:
+ .string "rc4(8x,char)"
+ ___
+
++$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
++
+ print $code;
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.fips openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl 2013-02-19 20:12:54.600664970 +0100
+@@ -499,6 +499,8 @@ ___
+
+ $code =~ s/#([bwd])/$1/gm;
+
++$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
++
+ print $code;
+
+ close STDOUT;
+diff -up openssl-1.0.0k/crypto/rc4/Makefile.fips openssl-1.0.0k/crypto/rc4/Makefile
+--- openssl-1.0.0k/crypto/rc4/Makefile.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/Makefile 2013-02-19 20:12:54.600664970 +0100
@@ -21,8 +21,8 @@ TEST=rc4test.c
APPS=
@@ -10908,9 +10905,9 @@ diff -up openssl-1.0.0f/crypto/rc4/Makefile.fips openssl-1.0.0f/crypto/rc4/Makef
SRC= $(LIBSRC)
-diff -up openssl-1.0.0f/crypto/rc4/rc4_fblk.c.fips openssl-1.0.0f/crypto/rc4/rc4_fblk.c
---- openssl-1.0.0f/crypto/rc4/rc4_fblk.c.fips 2012-01-05 13:22:30.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/rc4_fblk.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/rc4_fblk.c.fips openssl-1.0.0k/crypto/rc4/rc4_fblk.c
+--- openssl-1.0.0k/crypto/rc4/rc4_fblk.c.fips 2013-02-19 20:12:54.601664990 +0100
++++ openssl-1.0.0k/crypto/rc4/rc4_fblk.c 2013-02-19 20:12:54.601664990 +0100
@@ -0,0 +1,75 @@
+/* crypto/rc4/rc4_fblk.c */
+/* Written by Dr Stephen N Henson (steve at openssl.org) for the OpenSSL
@@ -10987,9 +10984,9 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4_fblk.c.fips openssl-1.0.0f/crypto/rc4/rc4
+ }
+#endif
+
-diff -up openssl-1.0.0f/crypto/rc4/rc4.h.fips openssl-1.0.0f/crypto/rc4/rc4.h
---- openssl-1.0.0f/crypto/rc4/rc4.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/rc4.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/rc4.h.fips openssl-1.0.0k/crypto/rc4/rc4.h
+--- openssl-1.0.0k/crypto/rc4/rc4.h.fips 2013-02-19 20:12:53.860650927 +0100
++++ openssl-1.0.0k/crypto/rc4/rc4.h 2013-02-19 20:12:54.601664990 +0100
@@ -78,6 +78,9 @@ typedef struct rc4_key_st
@@ -11000,9 +10997,9 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4.h.fips openssl-1.0.0f/crypto/rc4/rc4.h
void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
unsigned char *outdata);
-diff -up openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips openssl-1.0.0f/crypto/rc4/rc4_skey.c
---- openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips 2007-01-21 14:07:13.000000000 +0100
-+++ openssl-1.0.0f/crypto/rc4/rc4_skey.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rc4/rc4_skey.c.fips openssl-1.0.0k/crypto/rc4/rc4_skey.c
+--- openssl-1.0.0k/crypto/rc4/rc4_skey.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rc4/rc4_skey.c 2013-02-19 20:12:54.601664990 +0100
@@ -59,6 +59,11 @@
#include <openssl/rc4.h>
#include "rc4_locl.h"
@@ -11027,7 +11024,7 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips openssl-1.0.0f/crypto/rc4/rc4
{
register RC4_INT tmp;
register int id1,id2;
-@@ -126,7 +135,12 @@ void RC4_set_key(RC4_KEY *key, int len,
+@@ -126,7 +135,12 @@ void RC4_set_key(RC4_KEY *key, int len,
* module...
* <appro at fy.chalmers.se>
*/
@@ -11040,9 +11037,9 @@ diff -up openssl-1.0.0f/crypto/rc4/rc4_skey.c.fips openssl-1.0.0f/crypto/rc4/rc4
unsigned char *cp=(unsigned char *)d;
for (i=0;i<256;i++) cp[i]=i;
-diff -up openssl-1.0.0f/crypto/ripemd/ripemd.h.fips openssl-1.0.0f/crypto/ripemd/ripemd.h
---- openssl-1.0.0f/crypto/ripemd/ripemd.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/ripemd/ripemd.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/ripemd/ripemd.h.fips openssl-1.0.0k/crypto/ripemd/ripemd.h
+--- openssl-1.0.0k/crypto/ripemd/ripemd.h.fips 2013-02-19 20:12:54.170656810 +0100
++++ openssl-1.0.0k/crypto/ripemd/ripemd.h 2013-02-19 20:12:54.601664990 +0100
@@ -91,6 +91,9 @@ typedef struct RIPEMD160state_st
unsigned int num;
} RIPEMD160_CTX;
@@ -11053,9 +11050,9 @@ diff -up openssl-1.0.0f/crypto/ripemd/ripemd.h.fips openssl-1.0.0f/crypto/ripemd
int RIPEMD160_Init(RIPEMD160_CTX *c);
int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);
-diff -up openssl-1.0.0f/crypto/ripemd/rmd_dgst.c.fips openssl-1.0.0f/crypto/ripemd/rmd_dgst.c
---- openssl-1.0.0f/crypto/ripemd/rmd_dgst.c.fips 2007-01-21 14:07:13.000000000 +0100
-+++ openssl-1.0.0f/crypto/ripemd/rmd_dgst.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/ripemd/rmd_dgst.c.fips openssl-1.0.0k/crypto/ripemd/rmd_dgst.c
+--- openssl-1.0.0k/crypto/ripemd/rmd_dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/ripemd/rmd_dgst.c 2013-02-19 20:12:54.601664990 +0100
@@ -59,6 +59,11 @@
#include <stdio.h>
#include "rmd_locl.h"
@@ -11077,9 +11074,9 @@ diff -up openssl-1.0.0f/crypto/ripemd/rmd_dgst.c.fips openssl-1.0.0f/crypto/ripe
{
memset (c,0,sizeof(*c));
c->A=RIPEMD160_A;
-diff -up openssl-1.0.0f/crypto/rsa/rsa_eay.c.fips openssl-1.0.0f/crypto/rsa/rsa_eay.c
---- openssl-1.0.0f/crypto/rsa/rsa_eay.c.fips 2011-10-19 16:58:34.000000000 +0200
-+++ openssl-1.0.0f/crypto/rsa/rsa_eay.c 2012-01-05 13:27:00.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_eay.c.fips openssl-1.0.0k/crypto/rsa/rsa_eay.c
+--- openssl-1.0.0k/crypto/rsa/rsa_eay.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_eay.c 2013-02-19 20:12:54.601664990 +0100
@@ -114,6 +114,10 @@
#include <openssl/bn.h>
#include <openssl/rsa.h>
@@ -11340,9 +11337,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_eay.c.fips openssl-1.0.0f/crypto/rsa/rsa_
rsa->flags|=RSA_FLAG_CACHE_PUBLIC|RSA_FLAG_CACHE_PRIVATE;
return(1);
}
-diff -up openssl-1.0.0f/crypto/rsa/rsa_err.c.fips openssl-1.0.0f/crypto/rsa/rsa_err.c
---- openssl-1.0.0f/crypto/rsa/rsa_err.c.fips 2008-12-29 17:11:56.000000000 +0100
-+++ openssl-1.0.0f/crypto/rsa/rsa_err.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_err.c.fips openssl-1.0.0k/crypto/rsa/rsa_err.c
+--- openssl-1.0.0k/crypto/rsa/rsa_err.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_err.c 2013-02-19 20:12:54.602665009 +0100
@@ -111,8 +111,12 @@ static ERR_STRING_DATA RSA_str_functs[]=
{ERR_FUNC(RSA_F_RSA_PRINT_FP), "RSA_print_fp"},
{ERR_FUNC(RSA_F_RSA_PRIV_DECODE), "RSA_PRIV_DECODE"},
@@ -11369,9 +11366,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_err.c.fips openssl-1.0.0f/crypto/rsa/rsa_
{ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"},
{ERR_REASON(RSA_R_PADDING_CHECK_FAILED) ,"padding check failed"},
{ERR_REASON(RSA_R_P_NOT_PRIME) ,"p not prime"},
-diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_gen.c
---- openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips 2007-03-28 02:15:27.000000000 +0200
-+++ openssl-1.0.0f/crypto/rsa/rsa_gen.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_gen.c.fips openssl-1.0.0k/crypto/rsa/rsa_gen.c
+--- openssl-1.0.0k/crypto/rsa/rsa_gen.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_gen.c 2013-02-19 20:12:54.602665009 +0100
@@ -67,6 +67,82 @@
#include "cryptlib.h"
#include <openssl/bn.h>
@@ -11455,7 +11452,7 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_
static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb);
-@@ -90,6 +166,23 @@ static int rsa_builtin_keygen(RSA *rsa,
+@@ -90,6 +166,23 @@ static int rsa_builtin_keygen(RSA *rsa,
int bitsp,bitsq,ok= -1,n=0;
BN_CTX *ctx=NULL;
@@ -11479,7 +11476,7 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_
ctx=BN_CTX_new();
if (ctx == NULL) goto err;
BN_CTX_start(ctx);
-@@ -201,6 +294,17 @@ static int rsa_builtin_keygen(RSA *rsa,
+@@ -201,6 +294,17 @@ static int rsa_builtin_keygen(RSA *rsa,
p = rsa->p;
if (!BN_mod_inverse(rsa->iqmp,rsa->q,p,ctx)) goto err;
@@ -11497,9 +11494,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_gen.c.fips openssl-1.0.0f/crypto/rsa/rsa_
ok=1;
err:
if (ok == -1)
-diff -up openssl-1.0.0f/crypto/rsa/rsa.h.fips openssl-1.0.0f/crypto/rsa/rsa.h
---- openssl-1.0.0f/crypto/rsa/rsa.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/rsa/rsa.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa.h.fips openssl-1.0.0k/crypto/rsa/rsa.h
+--- openssl-1.0.0k/crypto/rsa/rsa.h.fips 2013-02-19 20:12:54.354660303 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa.h 2013-02-19 20:12:54.602665009 +0100
@@ -74,6 +74,21 @@
#error RSA is disabled.
#endif
@@ -11569,9 +11566,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa.h.fips openssl-1.0.0f/crypto/rsa/rsa.h
#define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE 148
#define RSA_R_PADDING_CHECK_FAILED 114
#define RSA_R_P_NOT_PRIME 128
-diff -up openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips openssl-1.0.0f/crypto/rsa/rsa_lib.c
---- openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips 2009-12-09 14:38:20.000000000 +0100
-+++ openssl-1.0.0f/crypto/rsa/rsa_lib.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_lib.c.fips openssl-1.0.0k/crypto/rsa/rsa_lib.c
+--- openssl-1.0.0k/crypto/rsa/rsa_lib.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_lib.c 2013-02-19 20:12:54.602665009 +0100
@@ -80,6 +80,13 @@ RSA *RSA_new(void)
void RSA_set_default_method(const RSA_METHOD *meth)
@@ -11633,7 +11630,7 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips openssl-1.0.0f/crypto/rsa/rsa_
return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
}
-@@ -306,6 +339,13 @@ int RSA_private_decrypt(int flen, const
+@@ -306,6 +339,13 @@ int RSA_private_decrypt(int flen, const
int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
RSA *rsa, int padding)
{
@@ -11647,9 +11644,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_lib.c.fips openssl-1.0.0f/crypto/rsa/rsa_
return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
}
-diff -up openssl-1.0.0f/crypto/rsa/rsa_sign.c.fips openssl-1.0.0f/crypto/rsa/rsa_sign.c
---- openssl-1.0.0f/crypto/rsa/rsa_sign.c.fips 2007-04-24 03:05:42.000000000 +0200
-+++ openssl-1.0.0f/crypto/rsa/rsa_sign.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/rsa/rsa_sign.c.fips openssl-1.0.0k/crypto/rsa/rsa_sign.c
+--- openssl-1.0.0k/crypto/rsa/rsa_sign.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rsa/rsa_sign.c 2013-02-19 20:12:54.603665028 +0100
@@ -130,7 +130,8 @@ int RSA_sign(int type, const unsigned ch
i2d_X509_SIG(&sig,&p);
s=tmps;
@@ -11681,9 +11678,9 @@ diff -up openssl-1.0.0f/crypto/rsa/rsa_sign.c.fips openssl-1.0.0f/crypto/rsa/rsa
if (i <= 0) goto err;
-diff -up openssl-1.0.0f/crypto/seed/seed.c.fips openssl-1.0.0f/crypto/seed/seed.c
---- openssl-1.0.0f/crypto/seed/seed.c.fips 2008-12-16 08:41:21.000000000 +0100
-+++ openssl-1.0.0f/crypto/seed/seed.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/seed/seed.c.fips openssl-1.0.0k/crypto/seed/seed.c
+--- openssl-1.0.0k/crypto/seed/seed.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/seed/seed.c 2013-02-19 20:12:54.603665028 +0100
@@ -34,6 +34,9 @@
#include <openssl/seed.h>
@@ -11699,7 +11696,7 @@ diff -up openssl-1.0.0f/crypto/seed/seed.c.fips openssl-1.0.0f/crypto/seed/seed.
#endif
+#ifdef OPENSSL_FIPS
- void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
++void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
+ {
+ if (FIPS_mode())
+ FIPS_BAD_ABORT(SEED)
@@ -11708,14 +11705,14 @@ diff -up openssl-1.0.0f/crypto/seed/seed.c.fips openssl-1.0.0f/crypto/seed/seed.
+
+void private_SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
+#else
-+void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
+ void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks)
+#endif
{
seed_word x1, x2, x3, x4;
seed_word t0, t1;
-diff -up openssl-1.0.0f/crypto/seed/seed.h.fips openssl-1.0.0f/crypto/seed/seed.h
---- openssl-1.0.0f/crypto/seed/seed.h.fips 2012-01-05 13:22:28.000000000 +0100
-+++ openssl-1.0.0f/crypto/seed/seed.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/seed/seed.h.fips openssl-1.0.0k/crypto/seed/seed.h
+--- openssl-1.0.0k/crypto/seed/seed.h.fips 2013-02-19 20:12:54.022654004 +0100
++++ openssl-1.0.0k/crypto/seed/seed.h 2013-02-19 20:12:54.603665028 +0100
@@ -117,6 +117,9 @@ typedef struct seed_key_st {
} SEED_KEY_SCHEDULE;
@@ -11726,57 +11723,9 @@ diff -up openssl-1.0.0f/crypto/seed/seed.h.fips openssl-1.0.0f/crypto/seed/seed.
void SEED_set_key(const unsigned char rawkey[SEED_KEY_LENGTH], SEED_KEY_SCHEDULE *ks);
void SEED_encrypt(const unsigned char s[SEED_BLOCK_SIZE], unsigned char d[SEED_BLOCK_SIZE], const SEED_KEY_SCHEDULE *ks);
-diff -up openssl-1.0.0f/crypto/sha/sha_dgst.c.fips openssl-1.0.0f/crypto/sha/sha_dgst.c
---- openssl-1.0.0f/crypto/sha/sha_dgst.c.fips 2007-01-21 14:07:14.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha_dgst.c 2012-01-05 13:22:30.000000000 +0100
-@@ -57,6 +57,12 @@
- */
-
- #include <openssl/opensslconf.h>
-+#include <openssl/crypto.h>
-+#ifdef OPENSSL_FIPS
-+#include <openssl/fips.h>
-+#endif
-+
-+#include <openssl/err.h>
- #if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
-
- #undef SHA_1
-diff -up openssl-1.0.0f/crypto/sha/sha.h.fips openssl-1.0.0f/crypto/sha/sha.h
---- openssl-1.0.0f/crypto/sha/sha.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha.h 2012-01-05 13:22:30.000000000 +0100
-@@ -106,6 +106,9 @@ typedef struct SHAstate_st
- } SHA_CTX;
-
- #ifndef OPENSSL_NO_SHA0
-+#ifdef OPENSSL_FIPS
-+int private_SHA_Init(SHA_CTX *c);
-+#endif
- int SHA_Init(SHA_CTX *c);
- int SHA_Update(SHA_CTX *c, const void *data, size_t len);
- int SHA_Final(unsigned char *md, SHA_CTX *c);
-diff -up openssl-1.0.0f/crypto/sha/sha_locl.h.fips openssl-1.0.0f/crypto/sha/sha_locl.h
---- openssl-1.0.0f/crypto/sha/sha_locl.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha_locl.h 2012-01-05 13:22:30.000000000 +0100
-@@ -122,8 +122,15 @@ void sha1_block_data_order (SHA_CTX *c,
- #define INIT_DATA_h3 0x10325476UL
- #define INIT_DATA_h4 0xc3d2e1f0UL
-
-+#if defined(SHA_0) && defined(OPENSSL_FIPS)
-+FIPS_NON_FIPS_MD_Init(SHA)
-+#else
- int HASH_INIT (SHA_CTX *c)
-+#endif
- {
-+#if defined(SHA_1) && defined(OPENSSL_FIPS)
-+ FIPS_selftest_check();
-+#endif
- memset (c,0,sizeof(*c));
- c->h0=INIT_DATA_h0;
- c->h1=INIT_DATA_h1;
-diff -up openssl-1.0.0f/crypto/sha/sha1dgst.c.fips openssl-1.0.0f/crypto/sha/sha1dgst.c
---- openssl-1.0.0f/crypto/sha/sha1dgst.c.fips 2007-01-21 14:07:14.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha1dgst.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha1dgst.c.fips openssl-1.0.0k/crypto/sha/sha1dgst.c
+--- openssl-1.0.0k/crypto/sha/sha1dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha1dgst.c 2013-02-19 20:12:54.604665047 +0100
@@ -63,6 +63,10 @@
#define SHA_1
@@ -11788,9 +11737,9 @@ diff -up openssl-1.0.0f/crypto/sha/sha1dgst.c.fips openssl-1.0.0f/crypto/sha/sha
const char SHA1_version[]="SHA1" OPENSSL_VERSION_PTEXT;
-diff -up openssl-1.0.0f/crypto/sha/sha256.c.fips openssl-1.0.0f/crypto/sha/sha256.c
---- openssl-1.0.0f/crypto/sha/sha256.c.fips 2007-01-21 14:07:14.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha256.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha256.c.fips openssl-1.0.0k/crypto/sha/sha256.c
+--- openssl-1.0.0k/crypto/sha/sha256.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha256.c 2013-02-19 20:12:54.604665047 +0100
@@ -12,12 +12,19 @@
#include <openssl/crypto.h>
@@ -11821,9 +11770,9 @@ diff -up openssl-1.0.0f/crypto/sha/sha256.c.fips openssl-1.0.0f/crypto/sha/sha25
memset (c,0,sizeof(*c));
c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL;
c->h[2]=0x3c6ef372UL; c->h[3]=0xa54ff53aUL;
-diff -up openssl-1.0.0f/crypto/sha/sha512.c.fips openssl-1.0.0f/crypto/sha/sha512.c
---- openssl-1.0.0f/crypto/sha/sha512.c.fips 2009-12-30 12:53:33.000000000 +0100
-+++ openssl-1.0.0f/crypto/sha/sha512.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha512.c.fips openssl-1.0.0k/crypto/sha/sha512.c
+--- openssl-1.0.0k/crypto/sha/sha512.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha512.c 2013-02-19 20:12:54.604665047 +0100
@@ -5,6 +5,10 @@
* ====================================================================
*/
@@ -11855,9 +11804,57 @@ diff -up openssl-1.0.0f/crypto/sha/sha512.c.fips openssl-1.0.0f/crypto/sha/sha51
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
-diff -up openssl-1.0.0f/crypto/whrlpool/whrlpool.h.fips openssl-1.0.0f/crypto/whrlpool/whrlpool.h
---- openssl-1.0.0f/crypto/whrlpool/whrlpool.h.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/crypto/whrlpool/whrlpool.h 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/sha/sha_dgst.c.fips openssl-1.0.0k/crypto/sha/sha_dgst.c
+--- openssl-1.0.0k/crypto/sha/sha_dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/sha_dgst.c 2013-02-19 20:12:54.603665028 +0100
+@@ -57,6 +57,12 @@
+ */
+
+ #include <openssl/opensslconf.h>
++#include <openssl/crypto.h>
++#ifdef OPENSSL_FIPS
++#include <openssl/fips.h>
++#endif
++
++#include <openssl/err.h>
+ #if !defined(OPENSSL_NO_SHA0) && !defined(OPENSSL_NO_SHA)
+
+ #undef SHA_1
+diff -up openssl-1.0.0k/crypto/sha/sha.h.fips openssl-1.0.0k/crypto/sha/sha.h
+--- openssl-1.0.0k/crypto/sha/sha.h.fips 2013-02-19 20:12:53.892651535 +0100
++++ openssl-1.0.0k/crypto/sha/sha.h 2013-02-19 20:12:54.603665028 +0100
+@@ -106,6 +106,9 @@ typedef struct SHAstate_st
+ } SHA_CTX;
+
+ #ifndef OPENSSL_NO_SHA0
++#ifdef OPENSSL_FIPS
++int private_SHA_Init(SHA_CTX *c);
++#endif
+ int SHA_Init(SHA_CTX *c);
+ int SHA_Update(SHA_CTX *c, const void *data, size_t len);
+ int SHA_Final(unsigned char *md, SHA_CTX *c);
+diff -up openssl-1.0.0k/crypto/sha/sha_locl.h.fips openssl-1.0.0k/crypto/sha/sha_locl.h
+--- openssl-1.0.0k/crypto/sha/sha_locl.h.fips 2013-02-19 20:12:53.897651631 +0100
++++ openssl-1.0.0k/crypto/sha/sha_locl.h 2013-02-19 20:12:54.603665028 +0100
+@@ -122,8 +122,15 @@ void sha1_block_data_order (SHA_CTX *c,
+ #define INIT_DATA_h3 0x10325476UL
+ #define INIT_DATA_h4 0xc3d2e1f0UL
+
++#if defined(SHA_0) && defined(OPENSSL_FIPS)
++FIPS_NON_FIPS_MD_Init(SHA)
++#else
+ int HASH_INIT (SHA_CTX *c)
++#endif
+ {
++#if defined(SHA_1) && defined(OPENSSL_FIPS)
++ FIPS_selftest_check();
++#endif
+ memset (c,0,sizeof(*c));
+ c->h0=INIT_DATA_h0;
+ c->h1=INIT_DATA_h1;
+diff -up openssl-1.0.0k/crypto/whrlpool/whrlpool.h.fips openssl-1.0.0k/crypto/whrlpool/whrlpool.h
+--- openssl-1.0.0k/crypto/whrlpool/whrlpool.h.fips 2013-02-19 20:12:54.187657134 +0100
++++ openssl-1.0.0k/crypto/whrlpool/whrlpool.h 2013-02-19 20:12:54.604665047 +0100
@@ -24,6 +24,9 @@ typedef struct {
} WHIRLPOOL_CTX;
@@ -11868,9 +11865,9 @@ diff -up openssl-1.0.0f/crypto/whrlpool/whrlpool.h.fips openssl-1.0.0f/crypto/wh
int WHIRLPOOL_Init (WHIRLPOOL_CTX *c);
int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes);
void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits);
-diff -up openssl-1.0.0f/crypto/whrlpool/wp_dgst.c.fips openssl-1.0.0f/crypto/whrlpool/wp_dgst.c
---- openssl-1.0.0f/crypto/whrlpool/wp_dgst.c.fips 2008-12-29 13:35:49.000000000 +0100
-+++ openssl-1.0.0f/crypto/whrlpool/wp_dgst.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/crypto/whrlpool/wp_dgst.c.fips openssl-1.0.0k/crypto/whrlpool/wp_dgst.c
+--- openssl-1.0.0k/crypto/whrlpool/wp_dgst.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/whrlpool/wp_dgst.c 2013-02-19 20:12:54.604665047 +0100
@@ -53,8 +53,12 @@
#include "wp_locl.h"
@@ -11885,9 +11882,9 @@ diff -up openssl-1.0.0f/crypto/whrlpool/wp_dgst.c.fips openssl-1.0.0f/crypto/whr
{
memset (c,0,sizeof(*c));
return(1);
-diff -up openssl-1.0.0f/Makefile.org.fips openssl-1.0.0f/Makefile.org
---- openssl-1.0.0f/Makefile.org.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/Makefile.org 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/Makefile.org.fips openssl-1.0.0k/Makefile.org
+--- openssl-1.0.0k/Makefile.org.fips 2013-02-19 20:12:54.544663908 +0100
++++ openssl-1.0.0k/Makefile.org 2013-02-19 20:12:54.604665047 +0100
@@ -110,6 +110,9 @@ LIBKRB5=
ZLIB_INCLUDE=
LIBZLIB=
@@ -11915,9 +11912,124 @@ diff -up openssl-1.0.0f/Makefile.org.fips openssl-1.0.0f/Makefile.org
THIS=$${THIS:-$@} MAKEFILE=Makefile MAKEOVERRIDES=
# MAKEOVERRIDES= effectively "equalizes" GNU-ish and SysV-ish make flavors,
# which in turn eliminates ambiguities in variable treatment with -e.
-diff -up openssl-1.0.0f/ssl/ssl_ciph.c.fips openssl-1.0.0f/ssl/ssl_ciph.c
---- openssl-1.0.0f/ssl/ssl_ciph.c.fips 2011-12-02 13:51:05.000000000 +0100
-+++ openssl-1.0.0f/ssl/ssl_ciph.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/ssl/s23_clnt.c.fips openssl-1.0.0k/ssl/s23_clnt.c
+--- openssl-1.0.0k/ssl/s23_clnt.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s23_clnt.c 2013-02-19 20:12:54.607665104 +0100
+@@ -334,6 +334,14 @@ static int ssl23_client_hello(SSL *s)
+ version_major = TLS1_VERSION_MAJOR;
+ version_minor = TLS1_VERSION_MINOR;
+ }
++#ifdef OPENSSL_FIPS
++ else if(FIPS_mode())
++ {
++ SSLerr(SSL_F_SSL23_CLIENT_HELLO,
++ SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
++ return -1;
++ }
++#endif
+ else if (version == SSL3_VERSION)
+ {
+ version_major = SSL3_VERSION_MAJOR;
+@@ -617,6 +625,14 @@ static int ssl23_get_server_hello(SSL *s
+ if ((p[2] == SSL3_VERSION_MINOR) &&
+ !(s->options & SSL_OP_NO_SSLv3))
+ {
++#ifdef OPENSSL_FIPS
++ if(FIPS_mode())
++ {
++ SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,
++ SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
++ goto err;
++ }
++#endif
+ s->version=SSL3_VERSION;
+ s->method=SSLv3_client_method();
+ }
+diff -up openssl-1.0.0k/ssl/s23_srvr.c.fips openssl-1.0.0k/ssl/s23_srvr.c
+--- openssl-1.0.0k/ssl/s23_srvr.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s23_srvr.c 2013-02-19 20:12:54.607665104 +0100
+@@ -393,6 +393,15 @@ int ssl23_get_client_hello(SSL *s)
+ }
+ }
+
++#ifdef OPENSSL_FIPS
++ if (FIPS_mode() && (s->version < TLS1_VERSION))
++ {
++ SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO,
++ SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
++ goto err;
++ }
++#endif
++
+ if (s->state == SSL23_ST_SR_CLNT_HELLO_B)
+ {
+ /* we have SSLv3/TLSv1 in an SSLv2 header
+diff -up openssl-1.0.0k/ssl/s3_clnt.c.fips openssl-1.0.0k/ssl/s3_clnt.c
+--- openssl-1.0.0k/ssl/s3_clnt.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s3_clnt.c 2013-02-19 20:12:54.608665123 +0100
+@@ -156,6 +156,10 @@
+ #include <openssl/objects.h>
+ #include <openssl/evp.h>
+ #include <openssl/md5.h>
++#ifdef OPENSSL_FIPS
++#include <openssl/fips.h>
++#endif
++
+ #ifndef OPENSSL_NO_DH
+ #include <openssl/dh.h>
+ #endif
+@@ -1559,6 +1563,8 @@ int ssl3_get_key_exchange(SSL *s)
+ q=md_buf;
+ for (num=2; num > 0; num--)
+ {
++ EVP_MD_CTX_set_flags(&md_ctx,
++ EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ EVP_DigestInit_ex(&md_ctx,(num == 2)
+ ?s->ctx->md5:s->ctx->sha1, NULL);
+ EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
+diff -up openssl-1.0.0k/ssl/s3_enc.c.fips openssl-1.0.0k/ssl/s3_enc.c
+--- openssl-1.0.0k/ssl/s3_enc.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s3_enc.c 2013-02-19 20:12:54.609665142 +0100
+@@ -170,6 +170,7 @@ static int ssl3_generate_key_block(SSL *
+ #endif
+ k=0;
+ EVP_MD_CTX_init(&m5);
++ EVP_MD_CTX_set_flags(&m5, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ EVP_MD_CTX_init(&s1);
+ for (i=0; (int)i<num; i+=MD5_DIGEST_LENGTH)
+ {
+@@ -609,6 +610,8 @@ int ssl3_digest_cached_records(SSL *s)
+ if ((mask & s->s3->tmp.new_cipher->algorithm2) && md)
+ {
+ s->s3->handshake_dgst[i]=EVP_MD_CTX_create();
++ EVP_MD_CTX_set_flags(s->s3->handshake_dgst[i],
++ EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ EVP_DigestInit_ex(s->s3->handshake_dgst[i],md,NULL);
+ EVP_DigestUpdate(s->s3->handshake_dgst[i],hdata,hdatalen);
+ }
+@@ -665,6 +668,7 @@ static int ssl3_handshake_mac(SSL *s, in
+ return 0;
+ }
+ EVP_MD_CTX_init(&ctx);
++ EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ EVP_MD_CTX_copy_ex(&ctx,d);
+ n=EVP_MD_CTX_size(&ctx);
+ if (n < 0)
+diff -up openssl-1.0.0k/ssl/s3_srvr.c.fips openssl-1.0.0k/ssl/s3_srvr.c
+--- openssl-1.0.0k/ssl/s3_srvr.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/s3_srvr.c 2013-02-19 20:12:54.609665142 +0100
+@@ -1779,6 +1779,8 @@ int ssl3_send_server_key_exchange(SSL *s
+ j=0;
+ for (num=2; num > 0; num--)
+ {
++ EVP_MD_CTX_set_flags(&md_ctx,
++ EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ EVP_DigestInit_ex(&md_ctx,(num == 2)
+ ?s->ctx->md5:s->ctx->sha1, NULL);
+ EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
+diff -up openssl-1.0.0k/ssl/ssl_ciph.c.fips openssl-1.0.0k/ssl/ssl_ciph.c
+--- openssl-1.0.0k/ssl/ssl_ciph.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/ssl_ciph.c 2013-02-19 20:12:54.605665066 +0100
@@ -728,6 +728,9 @@ static void ssl_cipher_collect_ciphers(c
!(c->algorithm_auth & disabled_auth) &&
!(c->algorithm_enc & disabled_enc) &&
@@ -11940,10 +12052,10 @@ diff -up openssl-1.0.0f/ssl/ssl_ciph.c.fips openssl-1.0.0f/ssl/ssl_ciph.c
{
sk_SSL_CIPHER_push(cipherstack, curr->cipher);
#ifdef CIPHER_DEBUG
-diff -up openssl-1.0.0f/ssl/ssl_lib.c.fips openssl-1.0.0f/ssl/ssl_lib.c
---- openssl-1.0.0f/ssl/ssl_lib.c.fips 2011-09-26 19:04:49.000000000 +0200
-+++ openssl-1.0.0f/ssl/ssl_lib.c 2012-01-05 13:22:30.000000000 +0100
-@@ -1524,6 +1524,14 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *m
+diff -up openssl-1.0.0k/ssl/ssl_lib.c.fips openssl-1.0.0k/ssl/ssl_lib.c
+--- openssl-1.0.0k/ssl/ssl_lib.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/ssl_lib.c 2013-02-19 20:12:54.605665066 +0100
+@@ -1526,6 +1526,14 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *m
return(NULL);
}
@@ -11958,9 +12070,9 @@ diff -up openssl-1.0.0f/ssl/ssl_lib.c.fips openssl-1.0.0f/ssl/ssl_lib.c
if (SSL_get_ex_data_X509_STORE_CTX_idx() < 0)
{
SSLerr(SSL_F_SSL_CTX_NEW,SSL_R_X509_VERIFICATION_SETUP_PROBLEMS);
-diff -up openssl-1.0.0f/ssl/ssltest.c.fips openssl-1.0.0f/ssl/ssltest.c
---- openssl-1.0.0f/ssl/ssltest.c.fips 2012-01-05 13:22:29.000000000 +0100
-+++ openssl-1.0.0f/ssl/ssltest.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/ssl/ssltest.c.fips openssl-1.0.0k/ssl/ssltest.c
+--- openssl-1.0.0k/ssl/ssltest.c.fips 2013-02-19 20:12:54.542663869 +0100
++++ openssl-1.0.0k/ssl/ssltest.c 2013-02-19 20:12:54.606665085 +0100
@@ -268,6 +268,9 @@ static void sv_usage(void)
{
fprintf(stderr,"usage: ssltest [args ...]\n");
@@ -12035,124 +12147,9 @@ diff -up openssl-1.0.0f/ssl/ssltest.c.fips openssl-1.0.0f/ssl/ssltest.c
if(s->version == TLS1_VERSION)
FIPS_allow_md5(0);
# endif
-diff -up openssl-1.0.0f/ssl/s23_clnt.c.fips openssl-1.0.0f/ssl/s23_clnt.c
---- openssl-1.0.0f/ssl/s23_clnt.c.fips 2010-02-16 15:20:40.000000000 +0100
-+++ openssl-1.0.0f/ssl/s23_clnt.c 2012-01-05 13:22:30.000000000 +0100
-@@ -334,6 +334,14 @@ static int ssl23_client_hello(SSL *s)
- version_major = TLS1_VERSION_MAJOR;
- version_minor = TLS1_VERSION_MINOR;
- }
-+#ifdef OPENSSL_FIPS
-+ else if(FIPS_mode())
-+ {
-+ SSLerr(SSL_F_SSL23_CLIENT_HELLO,
-+ SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
-+ return -1;
-+ }
-+#endif
- else if (version == SSL3_VERSION)
- {
- version_major = SSL3_VERSION_MAJOR;
-@@ -617,6 +625,14 @@ static int ssl23_get_server_hello(SSL *s
- if ((p[2] == SSL3_VERSION_MINOR) &&
- !(s->options & SSL_OP_NO_SSLv3))
- {
-+#ifdef OPENSSL_FIPS
-+ if(FIPS_mode())
-+ {
-+ SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,
-+ SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
-+ goto err;
-+ }
-+#endif
- s->version=SSL3_VERSION;
- s->method=SSLv3_client_method();
- }
-diff -up openssl-1.0.0f/ssl/s23_srvr.c.fips openssl-1.0.0f/ssl/s23_srvr.c
---- openssl-1.0.0f/ssl/s23_srvr.c.fips 2010-02-16 15:20:40.000000000 +0100
-+++ openssl-1.0.0f/ssl/s23_srvr.c 2012-01-05 13:22:30.000000000 +0100
-@@ -393,6 +393,15 @@ int ssl23_get_client_hello(SSL *s)
- }
- }
-
-+#ifdef OPENSSL_FIPS
-+ if (FIPS_mode() && (s->version < TLS1_VERSION))
-+ {
-+ SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO,
-+ SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
-+ goto err;
-+ }
-+#endif
-+
- if (s->state == SSL23_ST_SR_CLNT_HELLO_B)
- {
- /* we have SSLv3/TLSv1 in an SSLv2 header
-diff -up openssl-1.0.0f/ssl/s3_clnt.c.fips openssl-1.0.0f/ssl/s3_clnt.c
---- openssl-1.0.0f/ssl/s3_clnt.c.fips 2011-12-26 20:38:19.000000000 +0100
-+++ openssl-1.0.0f/ssl/s3_clnt.c 2012-01-05 13:22:30.000000000 +0100
-@@ -156,6 +156,10 @@
- #include <openssl/objects.h>
- #include <openssl/evp.h>
- #include <openssl/md5.h>
-+#ifdef OPENSSL_FIPS
-+#include <openssl/fips.h>
-+#endif
-+
- #ifndef OPENSSL_NO_DH
- #include <openssl/dh.h>
- #endif
-@@ -1550,6 +1554,8 @@ int ssl3_get_key_exchange(SSL *s)
- q=md_buf;
- for (num=2; num > 0; num--)
- {
-+ EVP_MD_CTX_set_flags(&md_ctx,
-+ EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- EVP_DigestInit_ex(&md_ctx,(num == 2)
- ?s->ctx->md5:s->ctx->sha1, NULL);
- EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-diff -up openssl-1.0.0f/ssl/s3_enc.c.fips openssl-1.0.0f/ssl/s3_enc.c
---- openssl-1.0.0f/ssl/s3_enc.c.fips 2012-01-04 16:38:54.000000000 +0100
-+++ openssl-1.0.0f/ssl/s3_enc.c 2012-01-05 13:22:30.000000000 +0100
-@@ -170,6 +170,7 @@ static int ssl3_generate_key_block(SSL *
- #endif
- k=0;
- EVP_MD_CTX_init(&m5);
-+ EVP_MD_CTX_set_flags(&m5, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- EVP_MD_CTX_init(&s1);
- for (i=0; (int)i<num; i+=MD5_DIGEST_LENGTH)
- {
-@@ -616,6 +617,8 @@ int ssl3_digest_cached_records(SSL *s)
- if ((mask & s->s3->tmp.new_cipher->algorithm2) && md)
- {
- s->s3->handshake_dgst[i]=EVP_MD_CTX_create();
-+ EVP_MD_CTX_set_flags(s->s3->handshake_dgst[i],
-+ EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- EVP_DigestInit_ex(s->s3->handshake_dgst[i],md,NULL);
- EVP_DigestUpdate(s->s3->handshake_dgst[i],hdata,hdatalen);
- }
-@@ -672,6 +675,7 @@ static int ssl3_handshake_mac(SSL *s, in
- return 0;
- }
- EVP_MD_CTX_init(&ctx);
-+ EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- EVP_MD_CTX_copy_ex(&ctx,d);
- n=EVP_MD_CTX_size(&ctx);
- if (n < 0)
-diff -up openssl-1.0.0f/ssl/s3_srvr.c.fips openssl-1.0.0f/ssl/s3_srvr.c
---- openssl-1.0.0f/ssl/s3_srvr.c.fips 2012-01-04 16:27:54.000000000 +0100
-+++ openssl-1.0.0f/ssl/s3_srvr.c 2012-01-05 13:22:30.000000000 +0100
-@@ -1770,6 +1770,8 @@ int ssl3_send_server_key_exchange(SSL *s
- j=0;
- for (num=2; num > 0; num--)
- {
-+ EVP_MD_CTX_set_flags(&md_ctx,
-+ EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
- EVP_DigestInit_ex(&md_ctx,(num == 2)
- ?s->ctx->md5:s->ctx->sha1, NULL);
- EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-diff -up openssl-1.0.0f/ssl/t1_enc.c.fips openssl-1.0.0f/ssl/t1_enc.c
---- openssl-1.0.0f/ssl/t1_enc.c.fips 2010-06-15 19:25:15.000000000 +0200
-+++ openssl-1.0.0f/ssl/t1_enc.c 2012-01-05 13:22:30.000000000 +0100
+diff -up openssl-1.0.0k/ssl/t1_enc.c.fips openssl-1.0.0k/ssl/t1_enc.c
+--- openssl-1.0.0k/ssl/t1_enc.c.fips 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/ssl/t1_enc.c 2013-02-19 20:12:54.610665161 +0100
@@ -170,6 +170,8 @@ static int tls1_P_hash(const EVP_MD *md,
HMAC_CTX_init(&ctx);
diff --git a/openssl-1.0.0d-intelopts.patch b/openssl-1.0.0k-intelopts.patch
similarity index 97%
rename from openssl-1.0.0d-intelopts.patch
rename to openssl-1.0.0k-intelopts.patch
index 6aba7b3..0cf1852 100644
--- a/openssl-1.0.0d-intelopts.patch
+++ b/openssl-1.0.0k-intelopts.patch
@@ -1,233 +1,254 @@
-diff -up openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl.intelopts openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl
---- openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl.intelopts 2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl 2011-08-24 12:50:56.000000000 +0200
+diff -up openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl.intelopts openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl
+--- openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl.intelopts 2013-02-19 21:15:39.391403202 +0100
++++ openssl-1.0.0k/crypto/aes/asm/aesni-x86_64.pl 2013-02-19 21:15:39.427403937 +0100
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/perl
-
+ #
# ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@@ -11,10 +11,37 @@
+@@ -11,6 +11,145 @@
# OpenSSL context it's used with Intel engine, but can also be used as
- # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
+ # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
+#
+# Performance.
+#
-+# To start with see corresponding paragraph in aesni-x86_64.pl...
-+# Instead of filling table similar to one found there I've chosen to
-+# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
-+# The simplified table below represents 32-bit performance relative
-+# to 64-bit one in every given point. Ratios vary for different
-+# encryption modes, therefore interval values.
++# Given aes(enc|dec) instructions' latency asymptotic performance for
++# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
++# processed with 128-bit key. And given their throughput asymptotic
++# performance for parallelizable modes is 1.25 cycles per byte. Being
++# asymptotic limit it's not something you commonly achieve in reality,
++# but how close does one get? Below are results collected for
++# different modes and block sizes. Pairs of numbers are for en-/
++# decryption.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
-+# 53-67% 67-84% 91-94% 95-98% 97-99.5%
++# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
++# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
++# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
++# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
++# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
++# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+#
-+# Lower ratios for smaller block sizes are perfectly understandable,
-+# because function call overhead is higher in 32-bit mode. Largest
-+# 8-KB block performance is virtually same: 32-bit code is less than
-+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
++# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
++# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
++# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
++# The results were collected with specially crafted speed.c benchmark
++# in order to compare them with results reported in "Intel Advanced
++# Encryption Standard (AES) New Instruction Set" White Paper Revision
++# 3.0 dated May 2010. All above results are consistently better. This
++# module also provides better performance for block sizes smaller than
++# 128 bytes in points *not* represented in the above table.
++#
++# Looking at the results for 8-KB buffer.
++#
++# CFB and OFB results are far from the limit, because implementation
++# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
++# single-block aesni_encrypt, which is not the most optimal way to go.
++# CBC encrypt result is unexpectedly high and there is no documented
++# explanation for it. Seemingly there is a small penalty for feeding
++# the result back to AES unit the way it's done in CBC mode. There is
++# nothing one can do and the result appears optimal. CCM result is
++# identical to CBC, because CBC-MAC is essentially CBC encrypt without
++# saving output. CCM CTR "stays invisible," because it's neatly
++# interleaved with CBC-MAC. This provides ~30% improvement over
++# "straightforward" CCM implementation with CTR and CBC-MAC performed
++# disjointly. Parallelizable modes practically achieve the theoretical
++# limit.
++#
++# Looking at how results vary with buffer size.
++#
++# Curves are practically saturated at 1-KB buffer size. In most cases
++# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
++# CTR curve doesn't follow this pattern and is "slowest" changing one
++# with "256-byte" result being 87% of "8-KB." This is because overhead
++# in CTR mode is most computationally intensive. Small-block CCM
++# decrypt is slower than encrypt, because first CTR and last CBC-MAC
++# iterations can't be interleaved.
++#
++# Results for 192- and 256-bit keys.
++#
++# EVP-free results were observed to scale perfectly with number of
++# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
++# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
++# are a tad smaller, because the above mentioned penalty biases all
++# results by same constant value. In similar way function call
++# overhead affects small-block performance, as well as OFB and CFB
++# results. Differences are not large, most common coefficients are
++# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
++# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
-+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
-+# interleaves at most 6 aes[enc|dec] instructions, because there are
-+# not enough registers for 8x interleave [which should be optimal for
-+# Sandy Bridge]. Actually, performance results for 6x interleave
-+# factor presented in aesni-x86_64.pl (except for CTR) are for this
-+# module.
++# While Westmere processor features 6 cycles latency for aes[enc|dec]
++# instructions, which can be scheduled every second cycle, Sandy
++# Bridge spends 8 cycles per instruction, but it can schedule them
++# every cycle. This means that code targeting Westmere would perform
++# suboptimally on Sandy Bridge. Therefore this update.
++#
++# In addition, non-parallelizable CBC encrypt (as well as CCM) is
++# optimized. Relative improvement might appear modest, 8% on Westmere,
++# but in absolute terms it's 3.77 cycles per byte encrypted with
++# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
++# should be compared to asymptotic limits of 3.75 for Westmere and
++# 5.00 for Sandy Bridge. Actually, the fact that they get this close
++# to asymptotic limits is quite amazing. Indeed, the limit is
++# calculated as latency times number of rounds, 10 for 128-bit key,
++# and divided by 16, the number of bytes in block, or in other words
++# it accounts *solely* for aesenc instructions. But there are extra
++# instructions, and numbers so close to the asymptotic limits mean
++# that it's as if it takes as little as *one* additional cycle to
++# execute all of them. How is it possible? It is possible thanks to
++# out-of-order execution logic, which manages to overlap post-
++# processing of previous block, things like saving the output, with
++# actual encryption of current block, as well as pre-processing of
++# current block, things like fetching input and xor-ing it with
++# 0-round element of the key schedule, with actual encryption of
++# previous block. Keep this in mind...
++#
++# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
++# performance is achieved by interleaving instructions working on
++# independent blocks. In which case asymptotic limit for such modes
++# can be obtained by dividing above mentioned numbers by AES
++# instructions' interleave factor. Westmere can execute at most 3
++# instructions at a time, meaning that optimal interleave factor is 3,
++# and that's where the "magic" number of 1.25 comes from. "Optimal
++# interleave factor" means that increase of interleave factor does
++# not improve performance. The formula has proven to reflect reality
++# pretty well on Westmere... Sandy Bridge on the other hand can
++# execute up to 8 AES instructions at a time, so how does varying
++# interleave factor affect the performance? Here is table for ECB
++# (numbers are cycles per byte processed with 128-bit key):
++#
++# instruction interleave factor 3x 6x 8x
++# theoretical asymptotic limit 1.67 0.83 0.625
++# measured performance for 8KB block 1.05 0.86 0.84
++#
++# "as if" interleave factor 4.7x 5.8x 6.0x
++#
++# Further data for other parallelizable modes:
++#
++# CBC decrypt 1.16 0.93 0.93
++# CTR 1.14 0.91 n/a
++#
++# Well, given 3x column it's probably inappropriate to call the limit
++# asymptotic, if it can be surpassed, isn't it? What happens there?
++# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
++# magic is responsible for this. Processor overlaps not only the
++# additional instructions with AES ones, but even AES instructions
++# processing adjacent triplets of independent blocks. In the 6x case
++# additional instructions still claim disproportionally small amount
++# of additional cycles, but in 8x case number of instructions must be
++# a tad too high for out-of-order logic to cope with, and AES unit
++# remains underutilized... As you can see 8x interleave is hardly
++# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
++# utilizes 6x interleave because of limited register bank capacity.
++#
++# Higher interleave factors do have negative impact on Westmere
++# performance. While for ECB mode it's negligible ~1.5%, other
++# parallelizables perform ~5% worse, which is outweighed by ~25%
++# improvement on Sandy Bridge. To balance regression on Westmere
++# CTR mode was implemented with 6x aesenc interleave factor.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
- # crypto/aes/asm/aes-586.pl:-)
-+$inline=1; # inline _aesni_[en|de]crypt
-
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- push(@INC,"${dir}","${dir}../../perlasm");
-@@ -22,7 +49,8 @@ require "x86asm.pl";
-
- &asm_init($ARGV[0],$0);
+@@ -29,7 +168,7 @@ die "can't locate x86_64-xlate.pl";
--$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
-+if ($PREFIX eq "aesni") { $movekey=*movups; }
-+else { $movekey=*movups; }
+ open STDOUT,"| $^X $xlate $flavour $output";
- $len="eax";
- $rounds="ecx";
-@@ -32,114 +60,144 @@ $out="edi";
- $rounds_="ebx"; # backup copy for $rounds
- $key_="ebp"; # backup copy for $key
+-$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
++$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+ @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
--$inout0="xmm0";
--$inout1="xmm1";
--$inout2="xmm2";
--$rndkey0="xmm3";
--$rndkey1="xmm4";
--$ivec="xmm5";
--$in0="xmm6";
--$in1="xmm7"; $inout3="xmm7";
--
-+$rndkey0="xmm0";
-+$rndkey1="xmm1";
-+$inout0="xmm2";
-+$inout1="xmm3";
-+$inout2="xmm4";
-+$inout3="xmm5"; $in1="xmm5";
-+$inout4="xmm6"; $in0="xmm6";
-+$inout5="xmm7"; $ivec="xmm7";
-+
-+# AESNI extension
-+sub aeskeygenassist
-+{ my($dst,$src,$imm)=@_;
-+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
-+}
-+sub aescommon
-+{ my($opcodelet,$dst,$src)=@_;
-+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
-+}
-+sub aesimc { aescommon(0xdb,@_); }
-+sub aesenc { aescommon(0xdc,@_); }
-+sub aesenclast { aescommon(0xdd,@_); }
-+sub aesdec { aescommon(0xde,@_); }
-+sub aesdeclast { aescommon(0xdf,@_); }
-+
- # Inline version of internal aesni_[en|de]crypt1
-+{ my $sn;
- sub aesni_inline_generate1
--{ my $p=shift;
-+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
-+ $sn++;
+@@ -41,18 +180,20 @@ $inp="%rdi";
+ $out="%rsi";
+ $len="%rdx";
+ $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+-$ivp="%r8"; # cbc
++$ivp="%r8"; # cbc, ctr, ...
- &$movekey ($rndkey0,&QWP(0,$key));
- &$movekey ($rndkey1,&QWP(16,$key));
-+ &xorps ($ivec,$rndkey0) if (defined($ivec));
- &lea ($key,&DWP(32,$key));
-- &pxor ($inout0,$rndkey0);
-- &set_label("${p}1_loop");
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ &xorps ($inout,$ivec) if (defined($ivec));
-+ &xorps ($inout,$rndkey0) if (!defined($ivec));
-+ &set_label("${p}1_loop_$sn");
-+ eval"&aes${p} ($inout,$rndkey1)";
- &dec ($rounds);
- &$movekey ($rndkey1,&QWP(0,$key));
- &lea ($key,&DWP(16,$key));
-- &jnz (&label("${p}1_loop"));
-- eval"&aes${p}last ($inout0,$rndkey1)";
--}
-+ &jnz (&label("${p}1_loop_$sn"));
-+ eval"&aes${p}last ($inout,$rndkey1)";
-+}}
+ $rnds_="%r10d"; # backup copy for $rounds
+ $key_="%r11"; # backup copy for $key
- sub aesni_generate1 # fully unrolled loop
--{ my $p=shift;
-+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+ # %xmm register layout
+-$inout0="%xmm0"; $inout1="%xmm1";
+-$inout2="%xmm2"; $inout3="%xmm3";
+-$rndkey0="%xmm4"; $rndkey1="%xmm5";
++$rndkey0="%xmm0"; $rndkey1="%xmm1";
++$inout0="%xmm2"; $inout1="%xmm3";
++$inout2="%xmm4"; $inout3="%xmm5";
++$inout4="%xmm6"; $inout5="%xmm7";
++$inout6="%xmm8"; $inout7="%xmm9";
- &function_begin_B("_aesni_${p}rypt1");
-- &$movekey ($rndkey0,&QWP(0,$key));
-+ &movups ($rndkey0,&QWP(0,$key));
- &$movekey ($rndkey1,&QWP(0x10,$key));
-- &cmp ($rounds,11);
-- &pxor ($inout0,$rndkey0);
-+ &xorps ($inout,$rndkey0);
- &$movekey ($rndkey0,&QWP(0x20,$key));
- &lea ($key,&DWP(0x30,$key));
-+ &cmp ($rounds,11);
- &jb (&label("${p}128"));
- &lea ($key,&DWP(0x20,$key));
- &je (&label("${p}192"));
- &lea ($key,&DWP(0x20,$key));
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout,$rndkey1)";
- &$movekey ($rndkey1,&QWP(-0x40,$key));
-- eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey0)";
- &$movekey ($rndkey0,&QWP(-0x30,$key));
- &set_label("${p}192");
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout,$rndkey1)";
- &$movekey ($rndkey1,&QWP(-0x20,$key));
-- eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey0)";
- &$movekey ($rndkey0,&QWP(-0x10,$key));
- &set_label("${p}128");
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout,$rndkey1)";
- &$movekey ($rndkey1,&QWP(0,$key));
-- eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0x10,$key));
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout,$rndkey1)";
- &$movekey ($rndkey1,&QWP(0x20,$key));
-- eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0x30,$key));
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout,$rndkey1)";
- &$movekey ($rndkey1,&QWP(0x40,$key));
-- eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0x50,$key));
-- eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout,$rndkey1)";
- &$movekey ($rndkey1,&QWP(0x60,$key));
-- eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey0)";
- &$movekey ($rndkey0,&QWP(0x70,$key));
-- eval"&aes${p} ($inout0,$rndkey1)";
-- eval"&aes${p}last ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout,$rndkey1)";
-+ eval"&aes${p}last ($inout,$rndkey0)";
- &ret();
- &function_end_B("_aesni_${p}rypt1");
+-$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
+-$in1="%xmm8"; $in2="%xmm9";
++$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
++$in0="%xmm8"; $iv="%xmm9";
+
+ # Inline version of internal aesni_[en|de]crypt1.
+ #
+@@ -60,20 +201,29 @@ $in1="%xmm8"; $in2="%xmm9";
+ # cycles which take care of loop variables...
+ { my $sn;
+ sub aesni_generate1 {
+-my ($p,$key,$rounds)=@_;
++my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ ++$sn;
+ $code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
++___
++$code.=<<___ if (defined($ivec));
++ xorps $rndkey0,$ivec
+ lea 32($key),$key
+- pxor $rndkey0,$inout0
++ xorps $ivec,$inout
++___
++$code.=<<___ if (!defined($ivec));
++ lea 32($key),$key
++ xorps $rndkey0,$inout
++___
++$code.=<<___;
+ .Loop_${p}1_$sn:
+- aes${p} $rndkey1,$inout0
++ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+- aes${p}last $rndkey1,$inout0
++ aes${p}last $rndkey1,$inout
+ ___
+ }}
+ # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+@@ -86,7 +236,7 @@ $code.=<<___;
+ .align 16
+ ${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+- mov 240($key),$rounds # pull $rounds
++ mov 240($key),$rounds # key->rounds
+ ___
+ &aesni_generate1("enc",$key,$rounds);
+ $code.=<<___;
+@@ -99,7 +249,7 @@ $code.=<<___;
+ .align 16
+ ${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+- mov 240($key),$rounds # pull $rounds
++ mov 240($key),$rounds # key->rounds
+ ___
+ &aesni_generate1("dec",$key,$rounds);
+ $code.=<<___;
+@@ -109,16 +259,16 @@ $code.=<<___;
+ ___
}
--
-+
- # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
--# &aesni_generate1("dec");
-+&aesni_generate1("enc") if (!$inline);
- &function_begin_B("${PREFIX}_encrypt");
- &mov ("eax",&wparam(0));
- &mov ($key,&wparam(2));
- &movups ($inout0,&QWP(0,"eax"));
- &mov ($rounds,&DWP(240,$key));
- &mov ("eax",&wparam(1));
-- &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
-+ if ($inline)
-+ { &aesni_inline_generate1("enc"); }
-+ else
-+ { &call ("_aesni_encrypt1"); }
- &movups (&QWP(0,"eax"),$inout0);
- &ret ();
- &function_end_B("${PREFIX}_encrypt");
-
- # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
--# &aesni_generate1("dec");
-+&aesni_generate1("dec") if(!$inline);
- &function_begin_B("${PREFIX}_decrypt");
- &mov ("eax",&wparam(0));
- &mov ($key,&wparam(2));
- &movups ($inout0,&QWP(0,"eax"));
- &mov ($rounds,&DWP(240,$key));
- &mov ("eax",&wparam(1));
-- &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
-+ if ($inline)
-+ { &aesni_inline_generate1("dec"); }
-+ else
-+ { &call ("_aesni_decrypt1"); }
- &movups (&QWP(0,"eax"),$inout0);
- &ret ();
- &function_end_B("${PREFIX}_decrypt");
--
+
-# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
-# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
-# latency is 6, it turned out that it can be scheduled only every
-# *second* cycle. Thus 3x interleave is the one providing optimal
-+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
@@ -241,2081 +262,2060 @@ diff -up openssl-1.0.0d/crypto/aes/asm/aesni-x86.pl.intelopts openssl-1.0.0d/cry
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
-+# new processor is therefore 8x, but it's unfeasible to accommodate it
-+# in XMM registers addressable in 32-bit mode and therefore 6x is
-+# used instead...
-+
- sub aesni_generate3
- { my $p=shift;
++# new processor is therefore 8x...
+ sub aesni_generate3 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+@@ -131,25 +281,25 @@ _aesni_${dir}rypt3:
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+- pxor $rndkey0,$inout0
+- pxor $rndkey0,$inout1
+- pxor $rndkey0,$inout2
++ xorps $rndkey0,$inout0
++ xorps $rndkey0,$inout1
++ xorps $rndkey0,$inout2
++ $movkey ($key),$rndkey0
-@@ -148,24 +206,24 @@ sub aesni_generate3
- &shr ($rounds,1);
- &$movekey ($rndkey1,&QWP(16,$key));
- &lea ($key,&DWP(32,$key));
-- &pxor ($inout0,$rndkey0);
-+ &xorps ($inout0,$rndkey0);
- &pxor ($inout1,$rndkey0);
- &pxor ($inout2,$rndkey0);
-- &jmp (&label("${p}3_loop"));
-- &set_label("${p}3_loop",16);
-- eval"&aes${p} ($inout0,$rndkey1)";
- &$movekey ($rndkey0,&QWP(0,$key));
-+
-+ &set_label("${p}3_loop");
-+ eval"&aes${p} ($inout0,$rndkey1)";
- eval"&aes${p} ($inout1,$rndkey1)";
- &dec ($rounds);
- eval"&aes${p} ($inout2,$rndkey1)";
- &$movekey ($rndkey1,&QWP(16,$key));
- eval"&aes${p} ($inout0,$rndkey0)";
-- &lea ($key,&DWP(32,$key));
- eval"&aes${p} ($inout1,$rndkey0)";
-+ &lea ($key,&DWP(32,$key));
- eval"&aes${p} ($inout2,$rndkey0)";
-+ &$movekey ($rndkey0,&QWP(0,$key));
- &jnz (&label("${p}3_loop"));
- eval"&aes${p} ($inout0,$rndkey1)";
-- &$movekey ($rndkey0,&QWP(0,$key));
- eval"&aes${p} ($inout1,$rndkey1)";
- eval"&aes${p} ($inout2,$rndkey1)";
- eval"&aes${p}last ($inout0,$rndkey0)";
-@@ -187,27 +245,28 @@ sub aesni_generate4
- &$movekey ($rndkey1,&QWP(16,$key));
- &shr ($rounds,1);
- &lea ($key,&DWP(32,$key));
-- &pxor ($inout0,$rndkey0);
-+ &xorps ($inout0,$rndkey0);
- &pxor ($inout1,$rndkey0);
- &pxor ($inout2,$rndkey0);
- &pxor ($inout3,$rndkey0);
-- &jmp (&label("${p}3_loop"));
-- &set_label("${p}3_loop",16);
-- eval"&aes${p} ($inout0,$rndkey1)";
- &$movekey ($rndkey0,&QWP(0,$key));
-+
-+ &set_label("${p}4_loop");
-+ eval"&aes${p} ($inout0,$rndkey1)";
- eval"&aes${p} ($inout1,$rndkey1)";
- &dec ($rounds);
- eval"&aes${p} ($inout2,$rndkey1)";
- eval"&aes${p} ($inout3,$rndkey1)";
- &$movekey ($rndkey1,&QWP(16,$key));
- eval"&aes${p} ($inout0,$rndkey0)";
-- &lea ($key,&DWP(32,$key));
- eval"&aes${p} ($inout1,$rndkey0)";
-+ &lea ($key,&DWP(32,$key));
- eval"&aes${p} ($inout2,$rndkey0)";
- eval"&aes${p} ($inout3,$rndkey0)";
-- &jnz (&label("${p}3_loop"));
-+ &$movekey ($rndkey0,&QWP(0,$key));
-+ &jnz (&label("${p}4_loop"));
-+
- eval"&aes${p} ($inout0,$rndkey1)";
-- &$movekey ($rndkey0,&QWP(0,$key));
- eval"&aes${p} ($inout1,$rndkey1)";
- eval"&aes${p} ($inout2,$rndkey1)";
- eval"&aes${p} ($inout3,$rndkey1)";
-@@ -218,12 +277,76 @@ sub aesni_generate4
- &ret();
- &function_end_B("_aesni_${p}rypt4");
- }
-+
-+sub aesni_generate6
-+{ my $p=shift;
-+
-+ &function_begin_B("_aesni_${p}rypt6");
-+ &static_label("_aesni_${p}rypt6_enter");
-+ &$movekey ($rndkey0,&QWP(0,$key));
-+ &shr ($rounds,1);
-+ &$movekey ($rndkey1,&QWP(16,$key));
-+ &lea ($key,&DWP(32,$key));
-+ &xorps ($inout0,$rndkey0);
-+ &pxor ($inout1,$rndkey0); # pxor does better here
-+ eval"&aes${p} ($inout0,$rndkey1)";
-+ &pxor ($inout2,$rndkey0);
-+ eval"&aes${p} ($inout1,$rndkey1)";
-+ &pxor ($inout3,$rndkey0);
-+ &dec ($rounds);
-+ eval"&aes${p} ($inout2,$rndkey1)";
-+ &pxor ($inout4,$rndkey0);
-+ eval"&aes${p} ($inout3,$rndkey1)";
-+ &pxor ($inout5,$rndkey0);
-+ eval"&aes${p} ($inout4,$rndkey1)";
-+ &$movekey ($rndkey0,&QWP(0,$key));
-+ eval"&aes${p} ($inout5,$rndkey1)";
-+ &jmp (&label("_aesni_${p}rypt6_enter"));
+ .L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+- $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+- aes${dir} $rndkey0,$inout0
+ $movkey 16($key),$rndkey1
++ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
++ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+- $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+@@ -175,28 +325,28 @@ _aesni_${dir}rypt4:
+ shr \$1,$rounds
+ $movkey 16($key),$rndkey1
+ lea 32($key),$key
+- pxor $rndkey0,$inout0
+- pxor $rndkey0,$inout1
+- pxor $rndkey0,$inout2
+- pxor $rndkey0,$inout3
++ xorps $rndkey0,$inout0
++ xorps $rndkey0,$inout1
++ xorps $rndkey0,$inout2
++ xorps $rndkey0,$inout3
++ $movkey ($key),$rndkey0
+
+ .L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+- $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout1
+ dec $rounds
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+- aes${dir} $rndkey0,$inout0
+ $movkey 16($key),$rndkey1
++ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ lea 32($key),$key
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
++ $movkey ($key),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+- $movkey ($key),$rndkey0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+@@ -208,12 +358,158 @@ _aesni_${dir}rypt4:
+ .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+ ___
+ }
++sub aesni_generate6 {
++my $dir=shift;
++# As already mentioned it takes in $key and $rounds, which are *not*
++# preserved. $inout[0-5] is cipher/clear text...
++$code.=<<___;
++.type _aesni_${dir}rypt6,\@abi-omnipotent
++.align 16
++_aesni_${dir}rypt6:
++ $movkey ($key),$rndkey0
++ shr \$1,$rounds
++ $movkey 16($key),$rndkey1
++ lea 32($key),$key
++ xorps $rndkey0,$inout0
++ pxor $rndkey0,$inout1
++ aes${dir} $rndkey1,$inout0
++ pxor $rndkey0,$inout2
++ aes${dir} $rndkey1,$inout1
++ pxor $rndkey0,$inout3
++ aes${dir} $rndkey1,$inout2
++ pxor $rndkey0,$inout4
++ aes${dir} $rndkey1,$inout3
++ pxor $rndkey0,$inout5
++ dec $rounds
++ aes${dir} $rndkey1,$inout4
++ $movkey ($key),$rndkey0
++ aes${dir} $rndkey1,$inout5
++ jmp .L${dir}_loop6_enter
++.align 16
++.L${dir}_loop6:
++ aes${dir} $rndkey1,$inout0
++ aes${dir} $rndkey1,$inout1
++ dec $rounds
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey1,$inout3
++ aes${dir} $rndkey1,$inout4
++ aes${dir} $rndkey1,$inout5
++.L${dir}_loop6_enter: # happens to be 16-byte aligned
++ $movkey 16($key),$rndkey1
++ aes${dir} $rndkey0,$inout0
++ aes${dir} $rndkey0,$inout1
++ lea 32($key),$key
++ aes${dir} $rndkey0,$inout2
++ aes${dir} $rndkey0,$inout3
++ aes${dir} $rndkey0,$inout4
++ aes${dir} $rndkey0,$inout5
++ $movkey ($key),$rndkey0
++ jnz .L${dir}_loop6
+
-+ &set_label("${p}6_loop",16);
-+ eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout1,$rndkey1)";
-+ &dec ($rounds);
-+ eval"&aes${p} ($inout2,$rndkey1)";
-+ eval"&aes${p} ($inout3,$rndkey1)";
-+ eval"&aes${p} ($inout4,$rndkey1)";
-+ eval"&aes${p} ($inout5,$rndkey1)";
-+ &set_label("_aesni_${p}rypt6_enter",16);
-+ &$movekey ($rndkey1,&QWP(16,$key));
-+ eval"&aes${p} ($inout0,$rndkey0)";
-+ eval"&aes${p} ($inout1,$rndkey0)";
-+ &lea ($key,&DWP(32,$key));
-+ eval"&aes${p} ($inout2,$rndkey0)";
-+ eval"&aes${p} ($inout3,$rndkey0)";
-+ eval"&aes${p} ($inout4,$rndkey0)";
-+ eval"&aes${p} ($inout5,$rndkey0)";
-+ &$movekey ($rndkey0,&QWP(0,$key));
-+ &jnz (&label("${p}6_loop"));
++ aes${dir} $rndkey1,$inout0
++ aes${dir} $rndkey1,$inout1
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey1,$inout3
++ aes${dir} $rndkey1,$inout4
++ aes${dir} $rndkey1,$inout5
++ aes${dir}last $rndkey0,$inout0
++ aes${dir}last $rndkey0,$inout1
++ aes${dir}last $rndkey0,$inout2
++ aes${dir}last $rndkey0,$inout3
++ aes${dir}last $rndkey0,$inout4
++ aes${dir}last $rndkey0,$inout5
++ ret
++.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
++___
++}
++sub aesni_generate8 {
++my $dir=shift;
++# As already mentioned it takes in $key and $rounds, which are *not*
++# preserved. $inout[0-7] is cipher/clear text...
++$code.=<<___;
++.type _aesni_${dir}rypt8,\@abi-omnipotent
++.align 16
++_aesni_${dir}rypt8:
++ $movkey ($key),$rndkey0
++ shr \$1,$rounds
++ $movkey 16($key),$rndkey1
++ lea 32($key),$key
++ xorps $rndkey0,$inout0
++ xorps $rndkey0,$inout1
++ aes${dir} $rndkey1,$inout0
++ pxor $rndkey0,$inout2
++ aes${dir} $rndkey1,$inout1
++ pxor $rndkey0,$inout3
++ aes${dir} $rndkey1,$inout2
++ pxor $rndkey0,$inout4
++ aes${dir} $rndkey1,$inout3
++ pxor $rndkey0,$inout5
++ dec $rounds
++ aes${dir} $rndkey1,$inout4
++ pxor $rndkey0,$inout6
++ aes${dir} $rndkey1,$inout5
++ pxor $rndkey0,$inout7
++ $movkey ($key),$rndkey0
++ aes${dir} $rndkey1,$inout6
++ aes${dir} $rndkey1,$inout7
++ $movkey 16($key),$rndkey1
++ jmp .L${dir}_loop8_enter
++.align 16
++.L${dir}_loop8:
++ aes${dir} $rndkey1,$inout0
++ aes${dir} $rndkey1,$inout1
++ dec $rounds
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey1,$inout3
++ aes${dir} $rndkey1,$inout4
++ aes${dir} $rndkey1,$inout5
++ aes${dir} $rndkey1,$inout6
++ aes${dir} $rndkey1,$inout7
++ $movkey 16($key),$rndkey1
++.L${dir}_loop8_enter: # happens to be 16-byte aligned
++ aes${dir} $rndkey0,$inout0
++ aes${dir} $rndkey0,$inout1
++ lea 32($key),$key
++ aes${dir} $rndkey0,$inout2
++ aes${dir} $rndkey0,$inout3
++ aes${dir} $rndkey0,$inout4
++ aes${dir} $rndkey0,$inout5
++ aes${dir} $rndkey0,$inout6
++ aes${dir} $rndkey0,$inout7
++ $movkey ($key),$rndkey0
++ jnz .L${dir}_loop8
+
-+ eval"&aes${p} ($inout0,$rndkey1)";
-+ eval"&aes${p} ($inout1,$rndkey1)";
-+ eval"&aes${p} ($inout2,$rndkey1)";
-+ eval"&aes${p} ($inout3,$rndkey1)";
-+ eval"&aes${p} ($inout4,$rndkey1)";
-+ eval"&aes${p} ($inout5,$rndkey1)";
-+ eval"&aes${p}last ($inout0,$rndkey0)";
-+ eval"&aes${p}last ($inout1,$rndkey0)";
-+ eval"&aes${p}last ($inout2,$rndkey0)";
-+ eval"&aes${p}last ($inout3,$rndkey0)";
-+ eval"&aes${p}last ($inout4,$rndkey0)";
-+ eval"&aes${p}last ($inout5,$rndkey0)";
-+ &ret();
-+ &function_end_B("_aesni_${p}rypt6");
++ aes${dir} $rndkey1,$inout0
++ aes${dir} $rndkey1,$inout1
++ aes${dir} $rndkey1,$inout2
++ aes${dir} $rndkey1,$inout3
++ aes${dir} $rndkey1,$inout4
++ aes${dir} $rndkey1,$inout5
++ aes${dir} $rndkey1,$inout6
++ aes${dir} $rndkey1,$inout7
++ aes${dir}last $rndkey0,$inout0
++ aes${dir}last $rndkey0,$inout1
++ aes${dir}last $rndkey0,$inout2
++ aes${dir}last $rndkey0,$inout3
++ aes${dir}last $rndkey0,$inout4
++ aes${dir}last $rndkey0,$inout5
++ aes${dir}last $rndkey0,$inout6
++ aes${dir}last $rndkey0,$inout7
++ ret
++.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
++___
+}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
--
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
-+
++&aesni_generate8("enc") if ($PREFIX eq "aesni");
++&aesni_generate8("dec");
+
if ($PREFIX eq "aesni") {
-+######################################################################
++########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
- # size_t length, const AES_KEY *key,
- # int enc);
-@@ -232,62 +355,93 @@ if ($PREFIX eq "aesni") {
- &mov ($out,&wparam(1));
- &mov ($len,&wparam(2));
- &mov ($key,&wparam(3));
-- &mov ($rounds,&wparam(4));
-- &cmp ($len,16);
-- &jb (&label("ecb_ret"));
-+ &mov ($rounds_,&wparam(4));
- &and ($len,-16);
-- &test ($rounds,$rounds)
-+ &jz (&label("ecb_ret"));
- &mov ($rounds,&DWP(240,$key));
-+ &test ($rounds_,$rounds_);
-+ &jz (&label("ecb_decrypt"));
-+
- &mov ($key_,$key); # backup $key
- &mov ($rounds_,$rounds); # backup $rounds
-- &jz (&label("ecb_decrypt"));
-+ &cmp ($len,0x60);
-+ &jb (&label("ecb_enc_tail"));
-
-- &sub ($len,0x40);
-- &jbe (&label("ecb_enc_tail"));
-- &jmp (&label("ecb_enc_loop3"));
-+ &movdqu ($inout0,&QWP(0,$inp));
-+ &movdqu ($inout1,&QWP(0x10,$inp));
-+ &movdqu ($inout2,&QWP(0x20,$inp));
-+ &movdqu ($inout3,&QWP(0x30,$inp));
-+ &movdqu ($inout4,&QWP(0x40,$inp));
-+ &movdqu ($inout5,&QWP(0x50,$inp));
-+ &lea ($inp,&DWP(0x60,$inp));
-+ &sub ($len,0x60);
-+ &jmp (&label("ecb_enc_loop6_enter"));
+ # size_t length, const AES_KEY *key,
+ # int enc);
+@@ -222,54 +518,98 @@ $code.=<<___;
+ .type aesni_ecb_encrypt,\@function,5
+ .align 16
+ aesni_ecb_encrypt:
+- cmp \$16,$len # check length
+- jb .Lecb_ret
+-
+- mov 240($key),$rounds # pull $rounds
+ and \$-16,$len
++ jz .Lecb_ret
+
-+&set_label("ecb_enc_loop6",16);
-+ &movups (&QWP(0,$out),$inout0);
-+ &movdqu ($inout0,&QWP(0,$inp));
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &movdqu ($inout1,&QWP(0x10,$inp));
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &movdqu ($inout2,&QWP(0x20,$inp));
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &movdqu ($inout3,&QWP(0x30,$inp));
-+ &movups (&QWP(0x40,$out),$inout4);
-+ &movdqu ($inout4,&QWP(0x40,$inp));
-+ &movups (&QWP(0x50,$out),$inout5);
-+ &lea ($out,&DWP(0x60,$out));
-+ &movdqu ($inout5,&QWP(0x50,$inp));
-+ &lea ($inp,&DWP(0x60,$inp));
-+&set_label("ecb_enc_loop6_enter");
++ mov 240($key),$rounds # key->rounds
++ $movkey ($key),$rndkey0
+ mov $key,$key_ # backup $key
+- test %r8d,%r8d # 5th argument
+ mov $rounds,$rnds_ # backup $rounds
++ test %r8d,%r8d # 5th argument
+ jz .Lecb_decrypt
+ #--------------------------- ECB ENCRYPT ------------------------------#
+- sub \$0x40,$len
+- jbe .Lecb_enc_tail
+- jmp .Lecb_enc_loop3
++ cmp \$0x80,$len
++ jb .Lecb_enc_tail
+
-+ &call ("_aesni_encrypt6");
-
--&set_label("ecb_enc_loop3",16);
-- &movups ($inout0,&QWP(0,$inp));
-- &movups ($inout1,&QWP(0x10,$inp));
-- &movups ($inout2,&QWP(0x20,$inp));
-- &call ("_aesni_encrypt3");
-- &sub ($len,0x30);
-- &lea ($inp,&DWP(0x30,$inp));
-- &lea ($out,&DWP(0x30,$out));
-- &movups (&QWP(-0x30,$out),$inout0);
- &mov ($key,$key_); # restore $key
-- &movups (&QWP(-0x20,$out),$inout1);
- &mov ($rounds,$rounds_); # restore $rounds
-- &movups (&QWP(-0x10,$out),$inout2);
-- &ja (&label("ecb_enc_loop3"));
-+ &sub ($len,0x60);
-+ &jnc (&label("ecb_enc_loop6"));
-
--&set_label("ecb_enc_tail");
-- &add ($len,0x40);
-+ &movups (&QWP(0,$out),$inout0);
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &movups (&QWP(0x40,$out),$inout4);
-+ &movups (&QWP(0x50,$out),$inout5);
-+ &lea ($out,&DWP(0x60,$out));
-+ &add ($len,0x60);
- &jz (&label("ecb_ret"));
-
-- &cmp ($len,0x10);
-+&set_label("ecb_enc_tail");
- &movups ($inout0,&QWP(0,$inp));
-- &je (&label("ecb_enc_one"));
- &cmp ($len,0x20);
-+ &jb (&label("ecb_enc_one"));
- &movups ($inout1,&QWP(0x10,$inp));
- &je (&label("ecb_enc_two"));
-- &cmp ($len,0x30);
- &movups ($inout2,&QWP(0x20,$inp));
-- &je (&label("ecb_enc_three"));
-+ &cmp ($len,0x40);
-+ &jb (&label("ecb_enc_three"));
- &movups ($inout3,&QWP(0x30,$inp));
-- &call ("_aesni_encrypt4");
-+ &je (&label("ecb_enc_four"));
-+ &movups ($inout4,&QWP(0x40,$inp));
-+ &xorps ($inout5,$inout5);
-+ &call ("_aesni_encrypt6");
- &movups (&QWP(0,$out),$inout0);
- &movups (&QWP(0x10,$out),$inout1);
- &movups (&QWP(0x20,$out),$inout2);
- &movups (&QWP(0x30,$out),$inout3);
-+ &movups (&QWP(0x40,$out),$inout4);
- jmp (&label("ecb_ret"));
-
- &set_label("ecb_enc_one",16);
-- &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
-+ if ($inline)
-+ { &aesni_inline_generate1("enc"); }
-+ else
-+ { &call ("_aesni_encrypt1"); }
- &movups (&QWP(0,$out),$inout0);
- &jmp (&label("ecb_ret"));
-
- &set_label("ecb_enc_two",16);
-+ &xorps ($inout2,$inout2);
- &call ("_aesni_encrypt3");
- &movups (&QWP(0,$out),$inout0);
- &movups (&QWP(0x10,$out),$inout1);
-@@ -300,53 +454,95 @@ if ($PREFIX eq "aesni") {
- &movups (&QWP(0x20,$out),$inout2);
- &jmp (&label("ecb_ret"));
++ movdqu ($inp),$inout0
++ movdqu 0x10($inp),$inout1
++ movdqu 0x20($inp),$inout2
++ movdqu 0x30($inp),$inout3
++ movdqu 0x40($inp),$inout4
++ movdqu 0x50($inp),$inout5
++ movdqu 0x60($inp),$inout6
++ movdqu 0x70($inp),$inout7
++ lea 0x80($inp),$inp
++ sub \$0x80,$len
++ jmp .Lecb_enc_loop8_enter
+ .align 16
+-.Lecb_enc_loop3:
+- movups ($inp),$inout0
+- movups 0x10($inp),$inout1
+- movups 0x20($inp),$inout2
+- call _aesni_encrypt3
+- sub \$0x30,$len
+- lea 0x30($inp),$inp
+- lea 0x30($out),$out
+- movups $inout0,-0x30($out)
+- mov $rnds_,$rounds # restore $rounds
+- movups $inout1,-0x20($out)
++.Lecb_enc_loop8:
++ movups $inout0,($out)
+ mov $key_,$key # restore $key
+- movups $inout2,-0x10($out)
+- ja .Lecb_enc_loop3
++ movdqu ($inp),$inout0
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout1,0x10($out)
++ movdqu 0x10($inp),$inout1
++ movups $inout2,0x20($out)
++ movdqu 0x20($inp),$inout2
++ movups $inout3,0x30($out)
++ movdqu 0x30($inp),$inout3
++ movups $inout4,0x40($out)
++ movdqu 0x40($inp),$inout4
++ movups $inout5,0x50($out)
++ movdqu 0x50($inp),$inout5
++ movups $inout6,0x60($out)
++ movdqu 0x60($inp),$inout6
++ movups $inout7,0x70($out)
++ lea 0x80($out),$out
++ movdqu 0x70($inp),$inout7
++ lea 0x80($inp),$inp
++.Lecb_enc_loop8_enter:
-+&set_label("ecb_enc_four",16);
-+ &call ("_aesni_encrypt4");
-+ &movups (&QWP(0,$out),$inout0);
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &jmp (&label("ecb_ret"));
-+######################################################################
- &set_label("ecb_decrypt",16);
-- &sub ($len,0x40);
-- &jbe (&label("ecb_dec_tail"));
-- &jmp (&label("ecb_dec_loop3"));
-+ &mov ($key_,$key); # backup $key
-+ &mov ($rounds_,$rounds); # backup $rounds
-+ &cmp ($len,0x60);
-+ &jb (&label("ecb_dec_tail"));
-+
-+ &movdqu ($inout0,&QWP(0,$inp));
-+ &movdqu ($inout1,&QWP(0x10,$inp));
-+ &movdqu ($inout2,&QWP(0x20,$inp));
-+ &movdqu ($inout3,&QWP(0x30,$inp));
-+ &movdqu ($inout4,&QWP(0x40,$inp));
-+ &movdqu ($inout5,&QWP(0x50,$inp));
-+ &lea ($inp,&DWP(0x60,$inp));
-+ &sub ($len,0x60);
-+ &jmp (&label("ecb_dec_loop6_enter"));
+-.Lecb_enc_tail:
+- add \$0x40,$len
++ call _aesni_encrypt8
+
-+&set_label("ecb_dec_loop6",16);
-+ &movups (&QWP(0,$out),$inout0);
-+ &movdqu ($inout0,&QWP(0,$inp));
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &movdqu ($inout1,&QWP(0x10,$inp));
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &movdqu ($inout2,&QWP(0x20,$inp));
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &movdqu ($inout3,&QWP(0x30,$inp));
-+ &movups (&QWP(0x40,$out),$inout4);
-+ &movdqu ($inout4,&QWP(0x40,$inp));
-+ &movups (&QWP(0x50,$out),$inout5);
-+ &lea ($out,&DWP(0x60,$out));
-+ &movdqu ($inout5,&QWP(0x50,$inp));
-+ &lea ($inp,&DWP(0x60,$inp));
-+&set_label("ecb_dec_loop6_enter");
++ sub \$0x80,$len
++ jnc .Lecb_enc_loop8
+
-+ &call ("_aesni_decrypt6");
-
--&set_label("ecb_dec_loop3",16);
-- &movups ($inout0,&QWP(0,$inp));
-- &movups ($inout1,&QWP(0x10,$inp));
-- &movups ($inout2,&QWP(0x20,$inp));
-- &call ("_aesni_decrypt3");
-- &sub ($len,0x30);
-- &lea ($inp,&DWP(0x30,$inp));
-- &lea ($out,&DWP(0x30,$out));
-- &movups (&QWP(-0x30,$out),$inout0);
- &mov ($key,$key_); # restore $key
-- &movups (&QWP(-0x20,$out),$inout1);
- &mov ($rounds,$rounds_); # restore $rounds
-- &movups (&QWP(-0x10,$out),$inout2);
-- &ja (&label("ecb_dec_loop3"));
-+ &sub ($len,0x60);
-+ &jnc (&label("ecb_dec_loop6"));
-
--&set_label("ecb_dec_tail");
-- &add ($len,0x40);
-+ &movups (&QWP(0,$out),$inout0);
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &movups (&QWP(0x40,$out),$inout4);
-+ &movups (&QWP(0x50,$out),$inout5);
-+ &lea ($out,&DWP(0x60,$out));
-+ &add ($len,0x60);
- &jz (&label("ecb_ret"));
-
-- &cmp ($len,0x10);
-+&set_label("ecb_dec_tail");
- &movups ($inout0,&QWP(0,$inp));
-- &je (&label("ecb_dec_one"));
- &cmp ($len,0x20);
-+ &jb (&label("ecb_dec_one"));
- &movups ($inout1,&QWP(0x10,$inp));
- &je (&label("ecb_dec_two"));
-- &cmp ($len,0x30);
- &movups ($inout2,&QWP(0x20,$inp));
-- &je (&label("ecb_dec_three"));
-+ &cmp ($len,0x40);
-+ &jb (&label("ecb_dec_three"));
- &movups ($inout3,&QWP(0x30,$inp));
-- &call ("_aesni_decrypt4");
-+ &je (&label("ecb_dec_four"));
-+ &movups ($inout4,&QWP(0x40,$inp));
-+ &xorps ($inout5,$inout5);
-+ &call ("_aesni_decrypt6");
- &movups (&QWP(0,$out),$inout0);
- &movups (&QWP(0x10,$out),$inout1);
- &movups (&QWP(0x20,$out),$inout2);
- &movups (&QWP(0x30,$out),$inout3);
-+ &movups (&QWP(0x40,$out),$inout4);
- &jmp (&label("ecb_ret"));
-
- &set_label("ecb_dec_one",16);
-- &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3");
-+ if ($inline)
-+ { &aesni_inline_generate1("dec"); }
-+ else
-+ { &call ("_aesni_decrypt1"); }
- &movups (&QWP(0,$out),$inout0);
- &jmp (&label("ecb_ret"));
++ movups $inout0,($out)
++ mov $key_,$key # restore $key
++ movups $inout1,0x10($out)
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
++ movups $inout6,0x60($out)
++ movups $inout7,0x70($out)
++ lea 0x80($out),$out
++ add \$0x80,$len
+ jz .Lecb_ret
- &set_label("ecb_dec_two",16);
-+ &xorps ($inout2,$inout2);
- &call ("_aesni_decrypt3");
- &movups (&QWP(0,$out),$inout0);
- &movups (&QWP(0x10,$out),$inout1);
-@@ -357,28 +553,42 @@ if ($PREFIX eq "aesni") {
- &movups (&QWP(0,$out),$inout0);
- &movups (&QWP(0x10,$out),$inout1);
- &movups (&QWP(0x20,$out),$inout2);
-+ &jmp (&label("ecb_ret"));
+- cmp \$0x10,$len
++.Lecb_enc_tail:
+ movups ($inp),$inout0
+- je .Lecb_enc_one
+ cmp \$0x20,$len
++ jb .Lecb_enc_one
+ movups 0x10($inp),$inout1
+ je .Lecb_enc_two
+- cmp \$0x30,$len
+ movups 0x20($inp),$inout2
+- je .Lecb_enc_three
++ cmp \$0x40,$len
++ jb .Lecb_enc_three
+ movups 0x30($inp),$inout3
+- call _aesni_encrypt4
++ je .Lecb_enc_four
++ movups 0x40($inp),$inout4
++ cmp \$0x60,$len
++ jb .Lecb_enc_five
++ movups 0x50($inp),$inout5
++ je .Lecb_enc_six
++ movdqu 0x60($inp),$inout6
++ call _aesni_encrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
++ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+ .align 16
+ .Lecb_enc_one:
+@@ -280,6 +620,7 @@ $code.=<<___;
+ jmp .Lecb_ret
+ .align 16
+ .Lecb_enc_two:
++ xorps $inout2,$inout2
+ call _aesni_encrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+@@ -291,47 +632,121 @@ $code.=<<___;
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ jmp .Lecb_ret
++.align 16
++.Lecb_enc_four:
++ call _aesni_encrypt4
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_enc_five:
++ xorps $inout5,$inout5
++ call _aesni_encrypt6
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_enc_six:
++ call _aesni_encrypt6
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
++ jmp .Lecb_ret
+ #--------------------------- ECB DECRYPT ------------------------------#
+ .align 16
+ .Lecb_decrypt:
+- sub \$0x40,$len
+- jbe .Lecb_dec_tail
+- jmp .Lecb_dec_loop3
++ cmp \$0x80,$len
++ jb .Lecb_dec_tail
+
-+&set_label("ecb_dec_four",16);
-+ &call ("_aesni_decrypt4");
-+ &movups (&QWP(0,$out),$inout0);
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &movups (&QWP(0x30,$out),$inout3);
-
- &set_label("ecb_ret");
- &function_end("aesni_ecb_encrypt");
- }
++ movdqu ($inp),$inout0
++ movdqu 0x10($inp),$inout1
++ movdqu 0x20($inp),$inout2
++ movdqu 0x30($inp),$inout3
++ movdqu 0x40($inp),$inout4
++ movdqu 0x50($inp),$inout5
++ movdqu 0x60($inp),$inout6
++ movdqu 0x70($inp),$inout7
++ lea 0x80($inp),$inp
++ sub \$0x80,$len
++ jmp .Lecb_dec_loop8_enter
+ .align 16
+-.Lecb_dec_loop3:
+- movups ($inp),$inout0
+- movups 0x10($inp),$inout1
+- movups 0x20($inp),$inout2
+- call _aesni_decrypt3
+- sub \$0x30,$len
+- lea 0x30($inp),$inp
+- lea 0x30($out),$out
+- movups $inout0,-0x30($out)
+- mov $rnds_,$rounds # restore $rounds
+- movups $inout1,-0x20($out)
++.Lecb_dec_loop8:
++ movups $inout0,($out)
+ mov $key_,$key # restore $key
+- movups $inout2,-0x10($out)
+- ja .Lecb_dec_loop3
++ movdqu ($inp),$inout0
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout1,0x10($out)
++ movdqu 0x10($inp),$inout1
++ movups $inout2,0x20($out)
++ movdqu 0x20($inp),$inout2
++ movups $inout3,0x30($out)
++ movdqu 0x30($inp),$inout3
++ movups $inout4,0x40($out)
++ movdqu 0x40($inp),$inout4
++ movups $inout5,0x50($out)
++ movdqu 0x50($inp),$inout5
++ movups $inout6,0x60($out)
++ movdqu 0x60($inp),$inout6
++ movups $inout7,0x70($out)
++ lea 0x80($out),$out
++ movdqu 0x70($inp),$inout7
++ lea 0x80($inp),$inp
++.Lecb_dec_loop8_enter:
++
++ call _aesni_decrypt8
++
++ $movkey ($key_),$rndkey0
++ sub \$0x80,$len
++ jnc .Lecb_dec_loop8
-+######################################################################
- # void $PREFIX_cbc_encrypt (const void *inp, void *out,
- # size_t length, const AES_KEY *key,
- # unsigned char *ivp,const int enc);
- &function_begin("${PREFIX}_cbc_encrypt");
- &mov ($inp,&wparam(0));
-+ &mov ($rounds_,"esp");
- &mov ($out,&wparam(1));
-+ &sub ($rounds_,24);
- &mov ($len,&wparam(2));
-+ &and ($rounds_,-16);
- &mov ($key,&wparam(3));
-- &test ($len,$len);
- &mov ($key_,&wparam(4));
-- &jz (&label("cbc_ret"));
-+ &test ($len,$len);
-+ &jz (&label("cbc_abort"));
+-.Lecb_dec_tail:
+- add \$0x40,$len
++ movups $inout0,($out)
++ mov $key_,$key # restore $key
++ movups $inout1,0x10($out)
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
++ movups $inout6,0x60($out)
++ movups $inout7,0x70($out)
++ lea 0x80($out),$out
++ add \$0x80,$len
+ jz .Lecb_ret
- &cmp (&wparam(5),0);
-- &movups ($ivec,&QWP(0,$key_)); # load IV
-+ &xchg ($rounds_,"esp"); # alloca
-+ &movups ($ivec,&QWP(0,$key_)); # load IV
- &mov ($rounds,&DWP(240,$key));
-- &mov ($key_,$key); # backup $key
-- &mov ($rounds_,$rounds); # backup $rounds
-+ &mov ($key_,$key); # backup $key
-+ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
-+ &mov ($rounds_,$rounds); # backup $rounds
- &je (&label("cbc_decrypt"));
+- cmp \$0x10,$len
++.Lecb_dec_tail:
+ movups ($inp),$inout0
+- je .Lecb_dec_one
+ cmp \$0x20,$len
++ jb .Lecb_dec_one
+ movups 0x10($inp),$inout1
+ je .Lecb_dec_two
+- cmp \$0x30,$len
+ movups 0x20($inp),$inout2
+- je .Lecb_dec_three
++ cmp \$0x40,$len
++ jb .Lecb_dec_three
+ movups 0x30($inp),$inout3
+- call _aesni_decrypt4
++ je .Lecb_dec_four
++ movups 0x40($inp),$inout4
++ cmp \$0x60,$len
++ jb .Lecb_dec_five
++ movups 0x50($inp),$inout5
++ je .Lecb_dec_six
++ movups 0x60($inp),$inout6
++ $movkey ($key),$rndkey0
++ call _aesni_decrypt8
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
+ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
++ movups $inout6,0x60($out)
+ jmp .Lecb_ret
+ .align 16
+ .Lecb_dec_one:
+@@ -342,6 +757,7 @@ $code.=<<___;
+ jmp .Lecb_ret
+ .align 16
+ .Lecb_dec_two:
++ xorps $inout2,$inout2
+ call _aesni_decrypt3
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+@@ -352,6 +768,34 @@ $code.=<<___;
+ movups $inout0,($out)
+ movups $inout1,0x10($out)
+ movups $inout2,0x20($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_dec_four:
++ call _aesni_decrypt4
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_dec_five:
++ xorps $inout5,$inout5
++ call _aesni_decrypt6
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ jmp .Lecb_ret
++.align 16
++.Lecb_dec_six:
++ call _aesni_decrypt6
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
- &movaps ($inout0,$ivec);
-@@ -388,15 +598,17 @@ if ($PREFIX eq "aesni") {
- &jmp (&label("cbc_enc_loop"));
+ .Lecb_ret:
+ ret
+@@ -362,7 +806,8 @@ ___
+ # void $PREFIX_cbc_encrypt (const void *inp, void *out,
+ # size_t length, const AES_KEY *key,
+ # unsigned char *ivp,const int enc);
+-$reserved = $win64?0x40:-0x18; # used in decrypt
++{
++my $reserved = $win64?0x40:-0x18; # used in decrypt
+ $code.=<<___;
+ .globl ${PREFIX}_cbc_encrypt
+ .type ${PREFIX}_cbc_encrypt,\@function,6
+@@ -371,30 +816,30 @@ ${PREFIX}_cbc_encrypt:
+ test $len,$len # check length
+ jz .Lcbc_ret
- &set_label("cbc_enc_loop",16);
-- &movups ($ivec,&QWP(0,$inp));
-+ &movups ($ivec,&QWP(0,$inp)); # input actually
- &lea ($inp,&DWP(16,$inp));
-- &pxor ($inout0,$ivec);
-- &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt3");
-- &sub ($len,16);
-- &lea ($out,&DWP(16,$out));
-+ if ($inline)
-+ { &aesni_inline_generate1("enc",$inout0,$ivec); }
-+ else
-+ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
- &mov ($rounds,$rounds_); # restore $rounds
- &mov ($key,$key_); # restore $key
-- &movups (&QWP(-16,$out),$inout0);
-+ &movups (&QWP(0,$out),$inout0); # store output
-+ &lea ($out,&DWP(16,$out));
-+ &sub ($len,16);
- &jnc (&label("cbc_enc_loop"));
- &add ($len,16);
- &jnz (&label("cbc_enc_tail"));
-@@ -415,90 +627,151 @@ if ($PREFIX eq "aesni") {
- &mov ($inp,$out); # $inp and $out are the same
- &mov ($key,$key_); # restore $key
- &jmp (&label("cbc_enc_loop"));
--
-+######################################################################
- &set_label("cbc_decrypt",16);
-- &sub ($len,0x40);
-+ &cmp ($len,0x50);
- &jbe (&label("cbc_dec_tail"));
-- &jmp (&label("cbc_dec_loop3"));
-+ &movaps (&QWP(0,"esp"),$ivec); # save IV
-+ &sub ($len,0x50);
-+ &jmp (&label("cbc_dec_loop6_enter"));
-+
-+&set_label("cbc_dec_loop6",16);
-+ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
-+ &movups (&QWP(0,$out),$inout5);
-+ &lea ($out,&DWP(0x10,$out));
-+&set_label("cbc_dec_loop6_enter");
-+ &movdqu ($inout0,&QWP(0,$inp));
-+ &movdqu ($inout1,&QWP(0x10,$inp));
-+ &movdqu ($inout2,&QWP(0x20,$inp));
-+ &movdqu ($inout3,&QWP(0x30,$inp));
-+ &movdqu ($inout4,&QWP(0x40,$inp));
-+ &movdqu ($inout5,&QWP(0x50,$inp));
+- mov 240($key),$rnds_ # pull $rounds
++ mov 240($key),$rnds_ # key->rounds
+ mov $key,$key_ # backup $key
+ test %r9d,%r9d # 6th argument
+ jz .Lcbc_decrypt
+ #--------------------------- CBC ENCRYPT ------------------------------#
+ movups ($ivp),$inout0 # load iv as initial state
+- cmp \$16,$len
+ mov $rnds_,$rounds
++ cmp \$16,$len
+ jb .Lcbc_enc_tail
+ sub \$16,$len
+ jmp .Lcbc_enc_loop
+-.align 16
++.align 16
+ .Lcbc_enc_loop:
+ movups ($inp),$inout1 # load input
+ lea 16($inp),$inp
+- pxor $inout1,$inout0
++ #xorps $inout1,$inout0
+ ___
+- &aesni_generate1("enc",$key,$rounds);
++ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
+ $code.=<<___;
+- sub \$16,$len
+- lea 16($out),$out
+ mov $rnds_,$rounds # restore $rounds
+ mov $key_,$key # restore $key
+- movups $inout0,-16($out) # store output
++ movups $inout0,0($out) # store output
++ lea 16($out),$out
++ sub \$16,$len
+ jnc .Lcbc_enc_loop
+ add \$16,$len
+ jnz .Lcbc_enc_tail
+@@ -429,92 +874,238 @@ $code.=<<___ if ($win64);
+ ___
+ $code.=<<___;
+ movups ($ivp),$iv
+- sub \$0x40,$len
+ mov $rnds_,$rounds
++ cmp \$0x70,$len
+ jbe .Lcbc_dec_tail
+- jmp .Lcbc_dec_loop3
+-.align 16
+-.Lcbc_dec_loop3:
+- movups ($inp),$inout0
++ shr \$1,$rnds_
++ sub \$0x70,$len
++ mov $rnds_,$rounds
++ movaps $iv,$reserved(%rsp)
++ jmp .Lcbc_dec_loop8_enter
++.align 16
++.Lcbc_dec_loop8:
++ movaps $rndkey0,$reserved(%rsp) # save IV
++ movups $inout7,($out)
++ lea 0x10($out),$out
++.Lcbc_dec_loop8_enter:
++ $movkey ($key),$rndkey0
++ movups ($inp),$inout0 # load input
+ movups 0x10($inp),$inout1
+- movups 0x20($inp),$inout2
+- movaps $inout0,$in0
+- movaps $inout1,$in1
+- movaps $inout2,$in2
+- call _aesni_decrypt3
+- sub \$0x30,$len
+- lea 0x30($inp),$inp
+- lea 0x30($out),$out
+- pxor $iv,$inout0
+- pxor $in0,$inout1
+- movaps $in2,$iv
+- pxor $in1,$inout2
+- movups $inout0,-0x30($out)
+- mov $rnds_,$rounds # restore $rounds
+- movups $inout1,-0x20($out)
+- mov $key_,$key # restore $key
+- movups $inout2,-0x10($out)
+- ja .Lcbc_dec_loop3
++ $movkey 16($key),$rndkey1
--&set_label("cbc_dec_loop3",16);
-- &movups ($inout0,&QWP(0,$inp));
-- &movups ($inout1,&QWP(0x10,$inp));
-- &movups ($inout2,&QWP(0x20,$inp));
-- &movaps ($in0,$inout0);
-- &movaps ($in1,$inout1);
-- &call ("_aesni_decrypt3");
-- &sub ($len,0x30);
-- &lea ($inp,&DWP(0x30,$inp));
-- &lea ($out,&DWP(0x30,$out));
-- &pxor ($inout0,$ivec);
-- &pxor ($inout1,$in0);
-- &movups ($ivec,&QWP(-0x10,$inp));
-- &pxor ($inout2,$in1);
-- &movups (&QWP(-0x30,$out),$inout0);
-- &mov ($rounds,$rounds_) # restore $rounds
-- &movups (&QWP(-0x20,$out),$inout1);
-- &mov ($key,$key_); # restore $key
-- &movups (&QWP(-0x10,$out),$inout2);
-- &ja (&label("cbc_dec_loop3"));
-+ &call ("_aesni_decrypt6");
+-.Lcbc_dec_tail:
+- add \$0x40,$len
+- movups $iv,($ivp)
+- jz .Lcbc_dec_ret
++ lea 32($key),$key
++ movdqu 0x20($inp),$inout2
++ xorps $rndkey0,$inout0
++ movdqu 0x30($inp),$inout3
++ xorps $rndkey0,$inout1
++ movdqu 0x40($inp),$inout4
++ aesdec $rndkey1,$inout0
++ pxor $rndkey0,$inout2
++ movdqu 0x50($inp),$inout5
++ aesdec $rndkey1,$inout1
++ pxor $rndkey0,$inout3
++ movdqu 0x60($inp),$inout6
++ aesdec $rndkey1,$inout2
++ pxor $rndkey0,$inout4
++ movdqu 0x70($inp),$inout7
++ aesdec $rndkey1,$inout3
++ pxor $rndkey0,$inout5
++ dec $rounds
++ aesdec $rndkey1,$inout4
++ pxor $rndkey0,$inout6
++ aesdec $rndkey1,$inout5
++ pxor $rndkey0,$inout7
++ $movkey ($key),$rndkey0
++ aesdec $rndkey1,$inout6
++ aesdec $rndkey1,$inout7
++ $movkey 16($key),$rndkey1
++
++ call .Ldec_loop8_enter
-+ &movups ($rndkey1,&QWP(0,$inp));
-+ &movups ($rndkey0,&QWP(0x10,$inp));
-+ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
-+ &xorps ($inout1,$rndkey1);
-+ &movups ($rndkey1,&QWP(0x20,$inp));
-+ &xorps ($inout2,$rndkey0);
-+ &movups ($rndkey0,&QWP(0x30,$inp));
-+ &xorps ($inout3,$rndkey1);
-+ &movups ($rndkey1,&QWP(0x40,$inp));
-+ &xorps ($inout4,$rndkey0);
-+ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
-+ &xorps ($inout5,$rndkey1);
-+ &movups (&QWP(0,$out),$inout0);
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &lea ($inp,&DWP(0x60,$inp));
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &mov ($rounds,$rounds_) # restore $rounds
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &mov ($key,$key_); # restore $key
-+ &movups (&QWP(0x40,$out),$inout4);
-+ &lea ($out,&DWP(0x50,$out));
-+ &sub ($len,0x60);
-+ &ja (&label("cbc_dec_loop6"));
++ movups ($inp),$rndkey1 # re-load input
++ movups 0x10($inp),$rndkey0
++ xorps $reserved(%rsp),$inout0 # ^= IV
++ xorps $rndkey1,$inout1
++ movups 0x20($inp),$rndkey1
++ xorps $rndkey0,$inout2
++ movups 0x30($inp),$rndkey0
++ xorps $rndkey1,$inout3
++ movups 0x40($inp),$rndkey1
++ xorps $rndkey0,$inout4
++ movups 0x50($inp),$rndkey0
++ xorps $rndkey1,$inout5
++ movups 0x60($inp),$rndkey1
++ xorps $rndkey0,$inout6
++ movups 0x70($inp),$rndkey0 # IV
++ xorps $rndkey1,$inout7
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ mov $rnds_,$rounds # restore $rounds
++ movups $inout4,0x40($out)
++ mov $key_,$key # restore $key
++ movups $inout5,0x50($out)
++ lea 0x80($inp),$inp
++ movups $inout6,0x60($out)
++ lea 0x70($out),$out
++ sub \$0x80,$len
++ ja .Lcbc_dec_loop8
++
++ movaps $inout7,$inout0
++ movaps $rndkey0,$iv
++ add \$0x70,$len
++ jle .Lcbc_dec_tail_collected
++ movups $inout0,($out)
++ lea 1($rnds_,$rnds_),$rounds
++ lea 0x10($out),$out
++.Lcbc_dec_tail:
+ movups ($inp),$inout0
+- cmp \$0x10,$len
+ movaps $inout0,$in0
++ cmp \$0x10,$len
+ jbe .Lcbc_dec_one
+
-+ &movaps ($inout0,$inout5);
-+ &movaps ($ivec,$rndkey0);
-+ &add ($len,0x50);
-+ &jle (&label("cbc_dec_tail_collected"));
-+ &movups (&QWP(0,$out),$inout0);
-+ &lea ($out,&DWP(0x10,$out));
- &set_label("cbc_dec_tail");
-- &add ($len,0x40);
-- &jz (&label("cbc_ret"));
--
- &movups ($inout0,&QWP(0,$inp));
-- &cmp ($len,0x10);
- &movaps ($in0,$inout0);
-+ &cmp ($len,0x10);
- &jbe (&label("cbc_dec_one"));
+ movups 0x10($inp),$inout1
+- cmp \$0x20,$len
+ movaps $inout1,$in1
++ cmp \$0x20,$len
+ jbe .Lcbc_dec_two
+
- &movups ($inout1,&QWP(0x10,$inp));
-- &cmp ($len,0x20);
- &movaps ($in1,$inout1);
-+ &cmp ($len,0x20);
- &jbe (&label("cbc_dec_two"));
+ movups 0x20($inp),$inout2
+- cmp \$0x30,$len
+ movaps $inout2,$in2
++ cmp \$0x30,$len
+ jbe .Lcbc_dec_three
+
- &movups ($inout2,&QWP(0x20,$inp));
- &cmp ($len,0x30);
- &jbe (&label("cbc_dec_three"));
+ movups 0x30($inp),$inout3
+- call _aesni_decrypt4
+- pxor $iv,$inout0
+- movups 0x30($inp),$iv
+- pxor $in0,$inout1
++ cmp \$0x40,$len
++ jbe .Lcbc_dec_four
+
- &movups ($inout3,&QWP(0x30,$inp));
-- &call ("_aesni_decrypt4");
-+ &cmp ($len,0x40);
-+ &jbe (&label("cbc_dec_four"));
++ movups 0x40($inp),$inout4
++ cmp \$0x50,$len
++ jbe .Lcbc_dec_five
+
-+ &movups ($inout4,&QWP(0x40,$inp));
-+ &movaps (&QWP(0,"esp"),$ivec); # save IV
-+ &movups ($inout0,&QWP(0,$inp));
-+ &xorps ($inout5,$inout5);
-+ &call ("_aesni_decrypt6");
-+ &movups ($rndkey1,&QWP(0,$inp));
- &movups ($rndkey0,&QWP(0x10,$inp));
-+ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
-+ &xorps ($inout1,$rndkey1);
- &movups ($rndkey1,&QWP(0x20,$inp));
-- &pxor ($inout0,$ivec);
-- &pxor ($inout1,$in0);
-- &movups ($ivec,&QWP(0x30,$inp));
-+ &xorps ($inout2,$rndkey0);
-+ &movups ($rndkey0,&QWP(0x30,$inp));
-+ &xorps ($inout3,$rndkey1);
-+ &movups ($ivec,&QWP(0x40,$inp)); # IV
-+ &xorps ($inout4,$rndkey0);
- &movups (&QWP(0,$out),$inout0);
-- &pxor ($inout2,$rndkey0);
-- &pxor ($inout3,$rndkey1);
- &movups (&QWP(0x10,$out),$inout1);
- &movups (&QWP(0x20,$out),$inout2);
-- &movaps ($inout0,$inout3);
-- &lea ($out,&DWP(0x30,$out));
-+ &movups (&QWP(0x30,$out),$inout3);
-+ &lea ($out,&DWP(0x40,$out));
-+ &movaps ($inout0,$inout4);
-+ &sub ($len,0x50);
- &jmp (&label("cbc_dec_tail_collected"));
-
--&set_label("cbc_dec_one");
-- &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3");
-- &pxor ($inout0,$ivec);
-+&set_label("cbc_dec_one",16);
-+ if ($inline)
-+ { &aesni_inline_generate1("dec"); }
-+ else
-+ { &call ("_aesni_decrypt1"); }
-+ &xorps ($inout0,$ivec);
- &movaps ($ivec,$in0);
-+ &sub ($len,0x10);
- &jmp (&label("cbc_dec_tail_collected"));
-
--&set_label("cbc_dec_two");
-+&set_label("cbc_dec_two",16);
-+ &xorps ($inout2,$inout2);
- &call ("_aesni_decrypt3");
-- &pxor ($inout0,$ivec);
-- &pxor ($inout1,$in0);
-+ &xorps ($inout0,$ivec);
-+ &xorps ($inout1,$in0);
- &movups (&QWP(0,$out),$inout0);
- &movaps ($inout0,$inout1);
-- &movaps ($ivec,$in1);
- &lea ($out,&DWP(0x10,$out));
-+ &movaps ($ivec,$in1);
-+ &sub ($len,0x20);
- &jmp (&label("cbc_dec_tail_collected"));
-
--&set_label("cbc_dec_three");
-+&set_label("cbc_dec_three",16);
- &call ("_aesni_decrypt3");
-- &pxor ($inout0,$ivec);
-- &pxor ($inout1,$in0);
-- &pxor ($inout2,$in1);
-+ &xorps ($inout0,$ivec);
-+ &xorps ($inout1,$in0);
-+ &xorps ($inout2,$in1);
- &movups (&QWP(0,$out),$inout0);
-- &movups (&QWP(0x10,$out),$inout1);
- &movaps ($inout0,$inout2);
-- &movups ($ivec,&QWP(0x20,$inp));
-+ &movups (&QWP(0x10,$out),$inout1);
- &lea ($out,&DWP(0x20,$out));
-+ &movups ($ivec,&QWP(0x20,$inp));
-+ &sub ($len,0x30);
-+ &jmp (&label("cbc_dec_tail_collected"));
++ movups 0x50($inp),$inout5
++ cmp \$0x60,$len
++ jbe .Lcbc_dec_six
+
-+&set_label("cbc_dec_four",16);
-+ &call ("_aesni_decrypt4");
-+ &movups ($rndkey1,&QWP(0x10,$inp));
-+ &movups ($rndkey0,&QWP(0x20,$inp));
-+ &xorps ($inout0,$ivec);
-+ &movups ($ivec,&QWP(0x30,$inp));
-+ &xorps ($inout1,$in0);
-+ &movups (&QWP(0,$out),$inout0);
-+ &xorps ($inout2,$rndkey1);
-+ &movups (&QWP(0x10,$out),$inout1);
-+ &xorps ($inout3,$rndkey0);
-+ &movups (&QWP(0x20,$out),$inout2);
-+ &lea ($out,&DWP(0x30,$out));
-+ &movaps ($inout0,$inout3);
-+ &sub ($len,0x40);
-
- &set_label("cbc_dec_tail_collected");
- &and ($len,15);
-@@ -506,21 +779,21 @@ if ($PREFIX eq "aesni") {
- &movups (&QWP(0,$out),$inout0);
- &jmp (&label("cbc_ret"));
-
--&set_label("cbc_dec_tail_partial");
-- &mov ($key_,"esp");
-- &sub ("esp",16);
-- &and ("esp",-16);
-+&set_label("cbc_dec_tail_partial",16);
- &movaps (&QWP(0,"esp"),$inout0);
-+ &mov ("ecx",16);
- &mov ($inp,"esp");
-- &mov ("ecx",$len);
-+ &sub ("ecx",$len);
- &data_word(0xA4F3F689); # rep movsb
-- &mov ("esp",$key_);
-
- &set_label("cbc_ret");
-+ &mov ("esp",&DWP(16,"esp")); # pull original %esp
- &mov ($key_,&wparam(4));
- &movups (&QWP(0,$key_),$ivec); # output IV
-+&set_label("cbc_abort");
- &function_end("${PREFIX}_cbc_encrypt");
--
-+
-+######################################################################
- # Mechanical port from aesni-x86_64.pl.
- #
- # _aesni_set_encrypt_key is private interface,
-@@ -539,7 +812,7 @@ if ($PREFIX eq "aesni") {
- &jz (&label("bad_pointer"));
++ movups 0x60($inp),$inout6
++ movaps $iv,$reserved(%rsp) # save IV
++ call _aesni_decrypt8
++ movups ($inp),$rndkey1
++ movups 0x10($inp),$rndkey0
++ xorps $reserved(%rsp),$inout0 # ^= IV
++ xorps $rndkey1,$inout1
++ movups 0x20($inp),$rndkey1
++ xorps $rndkey0,$inout2
++ movups 0x30($inp),$rndkey0
++ xorps $rndkey1,$inout3
++ movups 0x40($inp),$rndkey1
++ xorps $rndkey0,$inout4
++ movups 0x50($inp),$rndkey0
++ xorps $rndkey1,$inout5
++ movups 0x60($inp),$iv # IV
++ xorps $rndkey0,$inout6
+ movups $inout0,($out)
+- pxor $in1,$inout2
+ movups $inout1,0x10($out)
+- pxor $in2,$inout3
+ movups $inout2,0x20($out)
+- movaps $inout3,$inout0
+- lea 0x30($out),$out
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ movups $inout5,0x50($out)
++ lea 0x60($out),$out
++ movaps $inout6,$inout0
++ sub \$0x70,$len
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_one:
+ ___
+ &aesni_generate1("dec",$key,$rounds);
+ $code.=<<___;
+- pxor $iv,$inout0
++ xorps $iv,$inout0
+ movaps $in0,$iv
++ sub \$0x10,$len
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_two:
++ xorps $inout2,$inout2
+ call _aesni_decrypt3
+- pxor $iv,$inout0
+- pxor $in0,$inout1
++ xorps $iv,$inout0
++ xorps $in0,$inout1
+ movups $inout0,($out)
+ movaps $in1,$iv
+ movaps $inout1,$inout0
+ lea 0x10($out),$out
++ sub \$0x20,$len
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_three:
+ call _aesni_decrypt3
+- pxor $iv,$inout0
+- pxor $in0,$inout1
++ xorps $iv,$inout0
++ xorps $in0,$inout1
+ movups $inout0,($out)
+- pxor $in1,$inout2
++ xorps $in1,$inout2
+ movups $inout1,0x10($out)
+ movaps $in2,$iv
+ movaps $inout2,$inout0
+ lea 0x20($out),$out
++ sub \$0x30,$len
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_four:
++ call _aesni_decrypt4
++ xorps $iv,$inout0
++ movups 0x30($inp),$iv
++ xorps $in0,$inout1
++ movups $inout0,($out)
++ xorps $in1,$inout2
++ movups $inout1,0x10($out)
++ xorps $in2,$inout3
++ movups $inout2,0x20($out)
++ movaps $inout3,$inout0
++ lea 0x30($out),$out
++ sub \$0x40,$len
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_five:
++ xorps $inout5,$inout5
++ call _aesni_decrypt6
++ movups 0x10($inp),$rndkey1
++ movups 0x20($inp),$rndkey0
++ xorps $iv,$inout0
++ xorps $in0,$inout1
++ xorps $rndkey1,$inout2
++ movups 0x30($inp),$rndkey1
++ xorps $rndkey0,$inout3
++ movups 0x40($inp),$iv
++ xorps $rndkey1,$inout4
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ lea 0x40($out),$out
++ movaps $inout4,$inout0
++ sub \$0x50,$len
++ jmp .Lcbc_dec_tail_collected
++.align 16
++.Lcbc_dec_six:
++ call _aesni_decrypt6
++ movups 0x10($inp),$rndkey1
++ movups 0x20($inp),$rndkey0
++ xorps $iv,$inout0
++ xorps $in0,$inout1
++ xorps $rndkey1,$inout2
++ movups 0x30($inp),$rndkey1
++ xorps $rndkey0,$inout3
++ movups 0x40($inp),$rndkey0
++ xorps $rndkey1,$inout4
++ movups 0x50($inp),$iv
++ xorps $rndkey0,$inout5
++ movups $inout0,($out)
++ movups $inout1,0x10($out)
++ movups $inout2,0x20($out)
++ movups $inout3,0x30($out)
++ movups $inout4,0x40($out)
++ lea 0x50($out),$out
++ movaps $inout5,$inout0
++ sub \$0x60,$len
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_tail_collected:
+@@ -523,10 +1114,12 @@ $code.=<<___;
+ jnz .Lcbc_dec_tail_partial
+ movups $inout0,($out)
+ jmp .Lcbc_dec_ret
++.align 16
+ .Lcbc_dec_tail_partial:
+ movaps $inout0,$reserved(%rsp)
++ mov \$16,%rcx
+ mov $out,%rdi
+- mov $len,%rcx
++ sub $len,%rcx
+ lea $reserved(%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
- &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
-- &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
-+ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
- &lea ($key,&DWP(16,$key));
- &cmp ($rounds,256);
- &je (&label("14rounds"));
-@@ -581,11 +854,11 @@ if ($PREFIX eq "aesni") {
- &lea ($key,&DWP(16,$key));
- &set_label("key_128_cold");
- &shufps ("xmm4","xmm0",0b00010000);
-- &pxor ("xmm0","xmm4");
-- &shufps ("xmm4","xmm0",0b10001100,);
-- &pxor ("xmm0","xmm4");
-- &pshufd ("xmm1","xmm1",0b11111111); # critical path
-- &pxor ("xmm0","xmm1");
-+ &xorps ("xmm0","xmm4");
-+ &shufps ("xmm4","xmm0",0b10001100);
-+ &xorps ("xmm0","xmm4");
-+ &shufps ("xmm1","xmm1",0b11111111); # critical path
-+ &xorps ("xmm0","xmm1");
- &ret();
+@@ -544,7 +1137,7 @@ $code.=<<___;
+ ret
+ .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+ ___
+-
++}
+ # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+ # int bits, AES_KEY *key)
+ { my ($inp,$bits,$key) = @_4args;
+@@ -556,7 +1149,7 @@ $code.=<<___;
+ .align 16
+ ${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+- call _aesni_set_encrypt_key
++ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+@@ -576,9 +1169,9 @@ ${PREFIX}_set_decrypt_key:
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+- cmp $key,$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
++ cmp $key,$inp
+ ja .Ldec_key_inverse
- &set_label("12rounds",16);
-@@ -620,11 +893,11 @@ if ($PREFIX eq "aesni") {
- &movaps ("xmm5","xmm2");
- &set_label("key_192b_warm");
- &shufps ("xmm4","xmm0",0b00010000);
-- &movaps ("xmm3","xmm2");
-- &pxor ("xmm0","xmm4");
-+ &movdqa ("xmm3","xmm2");
-+ &xorps ("xmm0","xmm4");
- &shufps ("xmm4","xmm0",0b10001100);
- &pslldq ("xmm3",4);
-- &pxor ("xmm0","xmm4");
-+ &xorps ("xmm0","xmm4");
- &pshufd ("xmm1","xmm1",0b01010101); # critical path
- &pxor ("xmm2","xmm3");
- &pxor ("xmm0","xmm1");
-@@ -683,11 +956,11 @@ if ($PREFIX eq "aesni") {
- &lea ($key,&DWP(16,$key));
- &set_label("key_256a_cold");
- &shufps ("xmm4","xmm0",0b00010000);
-- &pxor ("xmm0","xmm4");
-+ &xorps ("xmm0","xmm4");
- &shufps ("xmm4","xmm0",0b10001100);
-- &pxor ("xmm0","xmm4");
-- &pshufd ("xmm1","xmm1",0b11111111); # critical path
-- &pxor ("xmm0","xmm1");
-+ &xorps ("xmm0","xmm4");
-+ &shufps ("xmm1","xmm1",0b11111111); # critical path
-+ &xorps ("xmm0","xmm1");
- &ret();
+ $movkey ($key),%xmm0 # inverse middle
+@@ -605,16 +1198,16 @@ $code.=<<___;
+ .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_set_encrypt_key:
+-_aesni_set_encrypt_key:
++__aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+- test $inp,$inp
+ mov \$-1,%rax
++ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
- &set_label("key_256b",16);
-@@ -695,11 +968,11 @@ if ($PREFIX eq "aesni") {
- &lea ($key,&DWP(16,$key));
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+- pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
++ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ lea 16($key),%rax
+ cmp \$256,$bits
+ je .L14rounds
+@@ -729,11 +1322,11 @@ _aesni_set_encrypt_key:
+ lea 16(%rax),%rax
+ .Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+- pxor %xmm4, %xmm0
++ xorps %xmm4, %xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+- pxor %xmm4, %xmm0
+- pshufd \$0b11111111,%xmm1,%xmm1 # critical path
+- pxor %xmm1,%xmm0
++ xorps %xmm4, %xmm0
++ shufps \$0b11111111,%xmm1,%xmm1 # critical path
++ xorps %xmm1,%xmm0
+ ret
- &shufps ("xmm4","xmm2",0b00010000);
-- &pxor ("xmm2","xmm4");
-+ &xorps ("xmm2","xmm4");
- &shufps ("xmm4","xmm2",0b10001100);
-- &pxor ("xmm2","xmm4");
-- &pshufd ("xmm1","xmm1",0b10101010); # critical path
-- &pxor ("xmm2","xmm1");
-+ &xorps ("xmm2","xmm4");
-+ &shufps ("xmm1","xmm1",0b10101010); # critical path
-+ &xorps ("xmm2","xmm1");
- &ret();
+ .align 16
+@@ -744,11 +1337,11 @@ _aesni_set_encrypt_key:
+ movaps %xmm2, %xmm5
+ .Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+- movaps %xmm2,%xmm3
+- pxor %xmm4,%xmm0
++ movdqa %xmm2,%xmm3
++ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+- pxor %xmm4,%xmm0
++ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+@@ -772,11 +1365,11 @@ _aesni_set_encrypt_key:
+ lea 16(%rax),%rax
+ .Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+- pxor %xmm4,%xmm0
++ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+- pxor %xmm4,%xmm0
+- pshufd \$0b11111111,%xmm1,%xmm1 # critical path
+- pxor %xmm1,%xmm0
++ xorps %xmm4,%xmm0
++ shufps \$0b11111111,%xmm1,%xmm1 # critical path
++ xorps %xmm1,%xmm0
+ ret
- &set_label("bad_pointer",4);
-@@ -747,9 +1020,9 @@ if ($PREFIX eq "aesni") {
- &aesimc ("xmm1","xmm1");
- &lea ($key,&DWP(16,$key));
- &lea ("eax",&DWP(-16,"eax"));
-- &cmp ("eax",$key);
- &$movekey (&QWP(16,"eax"),"xmm0");
- &$movekey (&QWP(-16,$key),"xmm1");
-+ &cmp ("eax",$key);
- &ja (&label("dec_key_inverse"));
+ .align 16
+@@ -785,17 +1378,28 @@ _aesni_set_encrypt_key:
+ lea 16(%rax),%rax
- &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
-diff -up openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl.intelopts openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl
---- openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl.intelopts 2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/aes/asm/aesni-x86_64.pl 2011-08-24 12:50:56.000000000 +0200
+ shufps \$0b00010000,%xmm2,%xmm4
+- pxor %xmm4,%xmm2
++ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+- pxor %xmm4,%xmm2
+- pshufd \$0b10101010,%xmm1,%xmm1 # critical path
+- pxor %xmm1,%xmm2
++ xorps %xmm4,%xmm2
++ shufps \$0b10101010,%xmm1,%xmm1 # critical path
++ xorps %xmm1,%xmm2
+ ret
+ .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
++.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+ ___
+ }
+
+ $code.=<<___;
++.align 64
++.Lbswap_mask:
++ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
++.Lincrement32:
++ .long 6,6,6,0
++.Lincrement64:
++ .long 1,0,0,0
++.Lxts_magic:
++ .long 0x87,0,1,0
++
+ .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 64
+ ___
+diff -up openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl.intelopts openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl
+--- openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl.intelopts 2013-02-19 21:15:39.390403182 +0100
++++ openssl-1.0.0k/crypto/aes/asm/aesni-x86.pl 2013-02-19 21:15:39.425403896 +0100
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/perl
- #
+
# ====================================================================
# Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -11,6 +11,145 @@
+@@ -11,10 +11,37 @@
# OpenSSL context it's used with Intel engine, but can also be used as
- # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+ # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
+#
+# Performance.
+#
-+# Given aes(enc|dec) instructions' latency asymptotic performance for
-+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
-+# processed with 128-bit key. And given their throughput asymptotic
-+# performance for parallelizable modes is 1.25 cycles per byte. Being
-+# asymptotic limit it's not something you commonly achieve in reality,
-+# but how close does one get? Below are results collected for
-+# different modes and block sized. Pairs of numbers are for en-/
-+# decryption.
++# To start with see corresponding paragraph in aesni-x86_64.pl...
++# Instead of filling table similar to one found there I've chosen to
++# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
++# The simplified table below represents 32-bit performance relative
++# to 64-bit one in every given point. Ratios vary for different
++# encryption modes, therefore interval values.
+#
+# 16-byte 64-byte 256-byte 1-KB 8-KB
-+# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
-+# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
-+# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
-+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
-+# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
-+# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
-+#
-+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
-+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
-+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
-+# The results were collected with specially crafted speed.c benchmark
-+# in order to compare them with results reported in "Intel Advanced
-+# Encryption Standard (AES) New Instruction Set" White Paper Revision
-+# 3.0 dated May 2010. All above results are consistently better. This
-+# module also provides better performance for block sizes smaller than
-+# 128 bytes in points *not* represented in the above table.
-+#
-+# Looking at the results for 8-KB buffer.
-+#
-+# CFB and OFB results are far from the limit, because implementation
-+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
-+# single-block aesni_encrypt, which is not the most optimal way to go.
-+# CBC encrypt result is unexpectedly high and there is no documented
-+# explanation for it. Seemingly there is a small penalty for feeding
-+# the result back to AES unit the way it's done in CBC mode. There is
-+# nothing one can do and the result appears optimal. CCM result is
-+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
-+# saving output. CCM CTR "stays invisible," because it's neatly
-+# interleaved wih CBC-MAC. This provides ~30% improvement over
-+# "straghtforward" CCM implementation with CTR and CBC-MAC performed
-+# disjointly. Parallelizable modes practically achieve the theoretical
-+# limit.
-+#
-+# Looking at how results vary with buffer size.
-+#
-+# Curves are practically saturated at 1-KB buffer size. In most cases
-+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
-+# CTR curve doesn't follow this pattern and is "slowest" changing one
-+# with "256-byte" result being 87% of "8-KB." This is because overhead
-+# in CTR mode is most computationally intensive. Small-block CCM
-+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
-+# iterations can't be interleaved.
-+#
-+# Results for 192- and 256-bit keys.
++# 53-67% 67-84% 91-94% 95-98% 97-99.5%
+#
-+# EVP-free results were observed to scale perfectly with number of
-+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
-+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
-+# are a tad smaller, because the above mentioned penalty biases all
-+# results by same constant value. In similar way function call
-+# overhead affects small-block performance, as well as OFB and CFB
-+# results. Differences are not large, most common coefficients are
-+# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
-+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
++# Lower ratios for smaller block sizes are perfectly understandable,
++# because function call overhead is higher in 32-bit mode. Largest
++# 8-KB block performance is virtually same: 32-bit code is less than
++# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
-+# While Westmere processor features 6 cycles latency for aes[enc|dec]
-+# instructions, which can be scheduled every second cycle, Sandy
-+# Bridge spends 8 cycles per instruction, but it can schedule them
-+# every cycle. This means that code targeting Westmere would perform
-+# suboptimally on Sandy Bridge. Therefore this update.
-+#
-+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
-+# optimized. Relative improvement might appear modest, 8% on Westmere,
-+# but in absolute terms it's 3.77 cycles per byte encrypted with
-+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
-+# should be compared to asymptotic limits of 3.75 for Westmere and
-+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
-+# to asymptotic limits is quite amazing. Indeed, the limit is
-+# calculated as latency times number of rounds, 10 for 128-bit key,
-+# and divided by 16, the number of bytes in block, or in other words
-+# it accounts *solely* for aesenc instructions. But there are extra
-+# instructions, and numbers so close to the asymptotic limits mean
-+# that it's as if it takes as little as *one* additional cycle to
-+# execute all of them. How is it possible? It is possible thanks to
-+# out-of-order execution logic, which manages to overlap post-
-+# processing of previous block, things like saving the output, with
-+# actual encryption of current block, as well as pre-processing of
-+# current block, things like fetching input and xor-ing it with
-+# 0-round element of the key schedule, with actual encryption of
-+# previous block. Keep this in mind...
-+#
-+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
-+# performance is achieved by interleaving instructions working on
-+# independent blocks. In which case asymptotic limit for such modes
-+# can be obtained by dividing above mentioned numbers by AES
-+# instructions' interleave factor. Westmere can execute at most 3
-+# instructions at a time, meaning that optimal interleave factor is 3,
-+# and that's where the "magic" number of 1.25 come from. "Optimal
-+# interleave factor" means that increase of interleave factor does
-+# not improve performance. The formula has proven to reflect reality
-+# pretty well on Westmere... Sandy Bridge on the other hand can
-+# execute up to 8 AES instructions at a time, so how does varying
-+# interleave factor affect the performance? Here is table for ECB
-+# (numbers are cycles per byte processed with 128-bit key):
-+#
-+# instruction interleave factor 3x 6x 8x
-+# theoretical asymptotic limit 1.67 0.83 0.625
-+# measured performance for 8KB block 1.05 0.86 0.84
-+#
-+# "as if" interleave factor 4.7x 5.8x 6.0x
-+#
-+# Further data for other parallelizable modes:
-+#
-+# CBC decrypt 1.16 0.93 0.93
-+# CTR 1.14 0.91 n/a
-+#
-+# Well, given 3x column it's probably inappropriate to call the limit
-+# asymptotic, if it can be surpassed, isn't it? What happens there?
-+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
-+# magic is responsible for this. Processor overlaps not only the
-+# additional instructions with AES ones, but even AES instuctions
-+# processing adjacent triplets of independent blocks. In the 6x case
-+# additional instructions still claim disproportionally small amount
-+# of additional cycles, but in 8x case number of instructions must be
-+# a tad too high for out-of-order logic to cope with, and AES unit
-+# remains underutilized... As you can see 8x interleave is hardly
-+# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
-+# utilizies 6x interleave because of limited register bank capacity.
-+#
-+# Higher interleave factors do have negative impact on Westmere
-+# performance. While for ECB mode it's negligible ~1.5%, other
-+# parallelizables perform ~5% worse, which is outweighed by ~25%
-+# improvement on Sandy Bridge. To balance regression on Westmere
-+# CTR mode was implemented with 6x aesenc interleave factor.
++# See aesni-x86_64.pl for details. Unlike x86_64 version this module
++# interleaves at most 6 aes[enc|dec] instructions, because there are
++# not enough registers for 8x interleave [which should be optimal for
++# Sandy Bridge]. Actually, performance results for 6x interleave
++# factor presented in aesni-x86_64.pl (except for CTR) are for this
++# module.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
-@@ -29,7 +168,7 @@ die "can't locate x86_64-xlate.pl";
+ # crypto/aes/asm/aes-586.pl:-)
++$inline=1; # inline _aesni_[en|de]crypt
- open STDOUT,"| $^X $xlate $flavour $output";
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC,"${dir}","${dir}../../perlasm");
+@@ -22,7 +49,8 @@ require "x86asm.pl";
--$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
-+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
- @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+ &asm_init($ARGV[0],$0);
-@@ -41,18 +180,20 @@ $inp="%rdi";
- $out="%rsi";
- $len="%rdx";
- $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
--$ivp="%r8"; # cbc
-+$ivp="%r8"; # cbc, ctr, ...
+-$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
++if ($PREFIX eq "aesni") { $movekey=*movups; }
++else { $movekey=*movups; }
- $rnds_="%r10d"; # backup copy for $rounds
- $key_="%r11"; # backup copy for $key
+ $len="eax";
+ $rounds="ecx";
+@@ -32,114 +60,144 @@ $out="edi";
+ $rounds_="ebx"; # backup copy for $rounds
+ $key_="ebp"; # backup copy for $key
+
+-$inout0="xmm0";
+-$inout1="xmm1";
+-$inout2="xmm2";
+-$rndkey0="xmm3";
+-$rndkey1="xmm4";
+-$ivec="xmm5";
+-$in0="xmm6";
+-$in1="xmm7"; $inout3="xmm7";
+-
++$rndkey0="xmm0";
++$rndkey1="xmm1";
++$inout0="xmm2";
++$inout1="xmm3";
++$inout2="xmm4";
++$inout3="xmm5"; $in1="xmm5";
++$inout4="xmm6"; $in0="xmm6";
++$inout5="xmm7"; $ivec="xmm7";
++
++# AESNI extension
++sub aeskeygenassist
++{ my($dst,$src,$imm)=@_;
++ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
++}
++sub aescommon
++{ my($opcodelet,$dst,$src)=@_;
++ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
++}
++sub aesimc { aescommon(0xdb,@_); }
++sub aesenc { aescommon(0xdc,@_); }
++sub aesenclast { aescommon(0xdd,@_); }
++sub aesdec { aescommon(0xde,@_); }
++sub aesdeclast { aescommon(0xdf,@_); }
++
+ # Inline version of internal aesni_[en|de]crypt1
++{ my $sn;
+ sub aesni_inline_generate1
+-{ my $p=shift;
++{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
++ $sn++;
+
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(16,$key));
++ &xorps ($ivec,$rndkey0) if (defined($ivec));
+ &lea ($key,&DWP(32,$key));
+- &pxor ($inout0,$rndkey0);
+- &set_label("${p}1_loop");
+- eval"&aes${p} ($inout0,$rndkey1)";
++ &xorps ($inout,$ivec) if (defined($ivec));
++ &xorps ($inout,$rndkey0) if (!defined($ivec));
++ &set_label("${p}1_loop_$sn");
++ eval"&aes${p} ($inout,$rndkey1)";
+ &dec ($rounds);
+ &$movekey ($rndkey1,&QWP(0,$key));
+ &lea ($key,&DWP(16,$key));
+- &jnz (&label("${p}1_loop"));
+- eval"&aes${p}last ($inout0,$rndkey1)";
+-}
++ &jnz (&label("${p}1_loop_$sn"));
++ eval"&aes${p}last ($inout,$rndkey1)";
++}}
- # %xmm register layout
--$inout0="%xmm0"; $inout1="%xmm1";
--$inout2="%xmm2"; $inout3="%xmm3";
--$rndkey0="%xmm4"; $rndkey1="%xmm5";
-+$rndkey0="%xmm0"; $rndkey1="%xmm1";
-+$inout0="%xmm2"; $inout1="%xmm3";
-+$inout2="%xmm4"; $inout3="%xmm5";
-+$inout4="%xmm6"; $inout5="%xmm7";
-+$inout6="%xmm8"; $inout7="%xmm9";
+ sub aesni_generate1 # fully unrolled loop
+-{ my $p=shift;
++{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
--$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
--$in1="%xmm8"; $in2="%xmm9";
-+$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
-+$in0="%xmm8"; $iv="%xmm9";
-
- # Inline version of internal aesni_[en|de]crypt1.
- #
-@@ -60,20 +201,29 @@ $in1="%xmm8"; $in2="%xmm9";
- # cycles which take care of loop variables...
- { my $sn;
- sub aesni_generate1 {
--my ($p,$key,$rounds)=@_;
-+my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
- ++$sn;
- $code.=<<___;
- $movkey ($key),$rndkey0
- $movkey 16($key),$rndkey1
-+___
-+$code.=<<___ if (defined($ivec));
-+ xorps $rndkey0,$ivec
- lea 32($key),$key
-- pxor $rndkey0,$inout0
-+ xorps $ivec,$inout
-+___
-+$code.=<<___ if (!defined($ivec));
-+ lea 32($key),$key
-+ xorps $rndkey0,$inout
-+___
-+$code.=<<___;
- .Loop_${p}1_$sn:
-- aes${p} $rndkey1,$inout0
-+ aes${p} $rndkey1,$inout
- dec $rounds
- $movkey ($key),$rndkey1
- lea 16($key),$key
- jnz .Loop_${p}1_$sn # loop body is 16 bytes
-- aes${p}last $rndkey1,$inout0
-+ aes${p}last $rndkey1,$inout
- ___
- }}
- # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
-@@ -86,7 +236,7 @@ $code.=<<___;
- .align 16
- ${PREFIX}_encrypt:
- movups ($inp),$inout0 # load input
-- mov 240($key),$rounds # pull $rounds
-+ mov 240($key),$rounds # key->rounds
- ___
- &aesni_generate1("enc",$key,$rounds);
- $code.=<<___;
-@@ -99,7 +249,7 @@ $code.=<<___;
- .align 16
- ${PREFIX}_decrypt:
- movups ($inp),$inout0 # load input
-- mov 240($key),$rounds # pull $rounds
-+ mov 240($key),$rounds # key->rounds
- ___
- &aesni_generate1("dec",$key,$rounds);
- $code.=<<___;
-@@ -109,16 +259,16 @@ $code.=<<___;
- ___
+ &function_begin_B("_aesni_${p}rypt1");
+- &$movekey ($rndkey0,&QWP(0,$key));
++ &movups ($rndkey0,&QWP(0,$key));
+ &$movekey ($rndkey1,&QWP(0x10,$key));
+- &cmp ($rounds,11);
+- &pxor ($inout0,$rndkey0);
++ &xorps ($inout,$rndkey0);
+ &$movekey ($rndkey0,&QWP(0x20,$key));
+ &lea ($key,&DWP(0x30,$key));
++ &cmp ($rounds,11);
+ &jb (&label("${p}128"));
+ &lea ($key,&DWP(0x20,$key));
+ &je (&label("${p}192"));
+ &lea ($key,&DWP(0x20,$key));
+- eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
+- eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
+ &set_label("${p}192");
+- eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
+- eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
+ &set_label("${p}128");
+- eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0,$key));
+- eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x10,$key));
+- eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x20,$key));
+- eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x30,$key));
+- eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x40,$key));
+- eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x50,$key));
+- eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(0x60,$key));
+- eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey0)";
+ &$movekey ($rndkey0,&QWP(0x70,$key));
+- eval"&aes${p} ($inout0,$rndkey1)";
+- eval"&aes${p}last ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout,$rndkey1)";
++ eval"&aes${p}last ($inout,$rndkey0)";
+ &ret();
+ &function_end_B("_aesni_${p}rypt1");
}
-
--# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
--# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
--# latency is 6, it turned out that it can be scheduled only every
--# *second* cycle. Thus 3x interleave is the one providing optimal
-+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
-+# factor. Why 3x subroutine were originally used in loops? Even though
-+# aes[enc|dec] latency was originally 6, it could be scheduled only
-+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
- # utilization, i.e. when subroutine's throughput is virtually same as
- # of non-interleaved subroutine [for number of input blocks up to 3].
--# This is why it makes no sense to implement 2x subroutine. As soon
--# as/if Intel improves throughput by making it possible to schedule
--# the instructions in question *every* cycles I would have to
--# implement 6x interleave and use it in loop...
-+# This is why it makes no sense to implement 2x subroutine.
-+# aes[enc|dec] latency in next processor generation is 8, but the
-+# instructions can be scheduled every cycle. Optimal interleave for
-+# new processor is therefore 8x...
- sub aesni_generate3 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
-@@ -131,25 +281,25 @@ _aesni_${dir}rypt3:
- shr \$1,$rounds
- $movkey 16($key),$rndkey1
- lea 32($key),$key
-- pxor $rndkey0,$inout0
-- pxor $rndkey0,$inout1
-- pxor $rndkey0,$inout2
-+ xorps $rndkey0,$inout0
-+ xorps $rndkey0,$inout1
-+ xorps $rndkey0,$inout2
-+ $movkey ($key),$rndkey0
-
- .L${dir}_loop3:
- aes${dir} $rndkey1,$inout0
-- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout1
- dec $rounds
- aes${dir} $rndkey1,$inout2
-- aes${dir} $rndkey0,$inout0
- $movkey 16($key),$rndkey1
-+ aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- lea 32($key),$key
- aes${dir} $rndkey0,$inout2
-+ $movkey ($key),$rndkey0
- jnz .L${dir}_loop3
-
- aes${dir} $rndkey1,$inout0
-- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir}last $rndkey0,$inout0
-@@ -175,28 +325,28 @@ _aesni_${dir}rypt4:
- shr \$1,$rounds
- $movkey 16($key),$rndkey1
- lea 32($key),$key
-- pxor $rndkey0,$inout0
-- pxor $rndkey0,$inout1
-- pxor $rndkey0,$inout2
-- pxor $rndkey0,$inout3
-+ xorps $rndkey0,$inout0
-+ xorps $rndkey0,$inout1
-+ xorps $rndkey0,$inout2
-+ xorps $rndkey0,$inout3
-+ $movkey ($key),$rndkey0
+-
++
+ # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+-# &aesni_generate1("dec");
++&aesni_generate1("enc") if (!$inline);
+ &function_begin_B("${PREFIX}_encrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+- &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ if ($inline)
++ { &aesni_inline_generate1("enc"); }
++ else
++ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+ &function_end_B("${PREFIX}_encrypt");
- .L${dir}_loop4:
- aes${dir} $rndkey1,$inout0
-- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout1
- dec $rounds
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
-- aes${dir} $rndkey0,$inout0
- $movkey 16($key),$rndkey1
-+ aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- lea 32($key),$key
- aes${dir} $rndkey0,$inout2
- aes${dir} $rndkey0,$inout3
-+ $movkey ($key),$rndkey0
- jnz .L${dir}_loop4
+ # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+-# &aesni_generate1("dec");
++&aesni_generate1("dec") if(!$inline);
+ &function_begin_B("${PREFIX}_decrypt");
+ &mov ("eax",&wparam(0));
+ &mov ($key,&wparam(2));
+ &movups ($inout0,&QWP(0,"eax"));
+ &mov ($rounds,&DWP(240,$key));
+ &mov ("eax",&wparam(1));
+- &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
++ if ($inline)
++ { &aesni_inline_generate1("dec"); }
++ else
++ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,"eax"),$inout0);
+ &ret ();
+ &function_end_B("${PREFIX}_decrypt");
+-
+-# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
+-# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
+-# latency is 6, it turned out that it can be scheduled only every
+-# *second* cycle. Thus 3x interleave is the one providing optimal
++
++# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
++# factor. Why were 3x subroutines originally used in loops? Even though
++# aes[enc|dec] latency was originally 6, it could be scheduled only
++# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+ # utilization, i.e. when subroutine's throughput is virtually same as
+ # of non-interleaved subroutine [for number of input blocks up to 3].
+-# This is why it makes no sense to implement 2x subroutine. As soon
+-# as/if Intel improves throughput by making it possible to schedule
+-# the instructions in question *every* cycles I would have to
+-# implement 6x interleave and use it in loop...
++# This is why it makes no sense to implement 2x subroutine.
++# aes[enc|dec] latency in next processor generation is 8, but the
++# instructions can be scheduled every cycle. Optimal interleave for
++# new processor is therefore 8x, but it's unfeasible to accommodate it
++# in XMM registers addressable in 32-bit mode and therefore 6x is
++# used instead...
++
+ sub aesni_generate3
+ { my $p=shift;
- aes${dir} $rndkey1,$inout0
-- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
-@@ -208,12 +358,158 @@ _aesni_${dir}rypt4:
- .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
- ___
+@@ -148,24 +206,24 @@ sub aesni_generate3
+ &shr ($rounds,1);
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &lea ($key,&DWP(32,$key));
+- &pxor ($inout0,$rndkey0);
++ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+- &jmp (&label("${p}3_loop"));
+- &set_label("${p}3_loop",16);
+- eval"&aes${p} ($inout0,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
++
++ &set_label("${p}3_loop");
++ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+- &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout1,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0,$key));
+ &jnz (&label("${p}3_loop"));
+ eval"&aes${p} ($inout0,$rndkey1)";
+- &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p}last ($inout0,$rndkey0)";
+@@ -187,27 +245,28 @@ sub aesni_generate4
+ &$movekey ($rndkey1,&QWP(16,$key));
+ &shr ($rounds,1);
+ &lea ($key,&DWP(32,$key));
+- &pxor ($inout0,$rndkey0);
++ &xorps ($inout0,$rndkey0);
+ &pxor ($inout1,$rndkey0);
+ &pxor ($inout2,$rndkey0);
+ &pxor ($inout3,$rndkey0);
+- &jmp (&label("${p}3_loop"));
+- &set_label("${p}3_loop",16);
+- eval"&aes${p} ($inout0,$rndkey1)";
+ &$movekey ($rndkey0,&QWP(0,$key));
++
++ &set_label("${p}4_loop");
++ eval"&aes${p} ($inout0,$rndkey1)";
+ eval"&aes${p} ($inout1,$rndkey1)";
+ &dec ($rounds);
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+ &$movekey ($rndkey1,&QWP(16,$key));
+ eval"&aes${p} ($inout0,$rndkey0)";
+- &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout1,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
+ eval"&aes${p} ($inout2,$rndkey0)";
+ eval"&aes${p} ($inout3,$rndkey0)";
+- &jnz (&label("${p}3_loop"));
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &jnz (&label("${p}4_loop"));
++
+ eval"&aes${p} ($inout0,$rndkey1)";
+- &$movekey ($rndkey0,&QWP(0,$key));
+ eval"&aes${p} ($inout1,$rndkey1)";
+ eval"&aes${p} ($inout2,$rndkey1)";
+ eval"&aes${p} ($inout3,$rndkey1)";
+@@ -218,12 +277,76 @@ sub aesni_generate4
+ &ret();
+ &function_end_B("_aesni_${p}rypt4");
}
-+sub aesni_generate6 {
-+my $dir=shift;
-+# As already mentioned it takes in $key and $rounds, which are *not*
-+# preserved. $inout[0-5] is cipher/clear text...
-+$code.=<<___;
-+.type _aesni_${dir}rypt6,\@abi-omnipotent
-+.align 16
-+_aesni_${dir}rypt6:
-+ $movkey ($key),$rndkey0
-+ shr \$1,$rounds
-+ $movkey 16($key),$rndkey1
-+ lea 32($key),$key
-+ xorps $rndkey0,$inout0
-+ pxor $rndkey0,$inout1
-+ aes${dir} $rndkey1,$inout0
-+ pxor $rndkey0,$inout2
-+ aes${dir} $rndkey1,$inout1
-+ pxor $rndkey0,$inout3
-+ aes${dir} $rndkey1,$inout2
-+ pxor $rndkey0,$inout4
-+ aes${dir} $rndkey1,$inout3
-+ pxor $rndkey0,$inout5
-+ dec $rounds
-+ aes${dir} $rndkey1,$inout4
-+ $movkey ($key),$rndkey0
-+ aes${dir} $rndkey1,$inout5
-+ jmp .L${dir}_loop6_enter
-+.align 16
-+.L${dir}_loop6:
-+ aes${dir} $rndkey1,$inout0
-+ aes${dir} $rndkey1,$inout1
-+ dec $rounds
-+ aes${dir} $rndkey1,$inout2
-+ aes${dir} $rndkey1,$inout3
-+ aes${dir} $rndkey1,$inout4
-+ aes${dir} $rndkey1,$inout5
-+.L${dir}_loop6_enter: # happens to be 16-byte aligned
-+ $movkey 16($key),$rndkey1
-+ aes${dir} $rndkey0,$inout0
-+ aes${dir} $rndkey0,$inout1
-+ lea 32($key),$key
-+ aes${dir} $rndkey0,$inout2
-+ aes${dir} $rndkey0,$inout3
-+ aes${dir} $rndkey0,$inout4
-+ aes${dir} $rndkey0,$inout5
-+ $movkey ($key),$rndkey0
-+ jnz .L${dir}_loop6
+
-+ aes${dir} $rndkey1,$inout0
-+ aes${dir} $rndkey1,$inout1
-+ aes${dir} $rndkey1,$inout2
-+ aes${dir} $rndkey1,$inout3
-+ aes${dir} $rndkey1,$inout4
-+ aes${dir} $rndkey1,$inout5
-+ aes${dir}last $rndkey0,$inout0
-+ aes${dir}last $rndkey0,$inout1
-+ aes${dir}last $rndkey0,$inout2
-+ aes${dir}last $rndkey0,$inout3
-+ aes${dir}last $rndkey0,$inout4
-+ aes${dir}last $rndkey0,$inout5
-+ ret
-+.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
-+___
-+}
-+sub aesni_generate8 {
-+my $dir=shift;
-+# As already mentioned it takes in $key and $rounds, which are *not*
-+# preserved. $inout[0-7] is cipher/clear text...
-+$code.=<<___;
-+.type _aesni_${dir}rypt8,\@abi-omnipotent
-+.align 16
-+_aesni_${dir}rypt8:
-+ $movkey ($key),$rndkey0
-+ shr \$1,$rounds
-+ $movkey 16($key),$rndkey1
-+ lea 32($key),$key
-+ xorps $rndkey0,$inout0
-+ xorps $rndkey0,$inout1
-+ aes${dir} $rndkey1,$inout0
-+ pxor $rndkey0,$inout2
-+ aes${dir} $rndkey1,$inout1
-+ pxor $rndkey0,$inout3
-+ aes${dir} $rndkey1,$inout2
-+ pxor $rndkey0,$inout4
-+ aes${dir} $rndkey1,$inout3
-+ pxor $rndkey0,$inout5
-+ dec $rounds
-+ aes${dir} $rndkey1,$inout4
-+ pxor $rndkey0,$inout6
-+ aes${dir} $rndkey1,$inout5
-+ pxor $rndkey0,$inout7
-+ $movkey ($key),$rndkey0
-+ aes${dir} $rndkey1,$inout6
-+ aes${dir} $rndkey1,$inout7
-+ $movkey 16($key),$rndkey1
-+ jmp .L${dir}_loop8_enter
-+.align 16
-+.L${dir}_loop8:
-+ aes${dir} $rndkey1,$inout0
-+ aes${dir} $rndkey1,$inout1
-+ dec $rounds
-+ aes${dir} $rndkey1,$inout2
-+ aes${dir} $rndkey1,$inout3
-+ aes${dir} $rndkey1,$inout4
-+ aes${dir} $rndkey1,$inout5
-+ aes${dir} $rndkey1,$inout6
-+ aes${dir} $rndkey1,$inout7
-+ $movkey 16($key),$rndkey1
-+.L${dir}_loop8_enter: # happens to be 16-byte aligned
-+ aes${dir} $rndkey0,$inout0
-+ aes${dir} $rndkey0,$inout1
-+ lea 32($key),$key
-+ aes${dir} $rndkey0,$inout2
-+ aes${dir} $rndkey0,$inout3
-+ aes${dir} $rndkey0,$inout4
-+ aes${dir} $rndkey0,$inout5
-+ aes${dir} $rndkey0,$inout6
-+ aes${dir} $rndkey0,$inout7
-+ $movkey ($key),$rndkey0
-+ jnz .L${dir}_loop8
++sub aesni_generate6
++{ my $p=shift;
++
++ &function_begin_B("_aesni_${p}rypt6");
++ &static_label("_aesni_${p}rypt6_enter");
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &shr ($rounds,1);
++ &$movekey ($rndkey1,&QWP(16,$key));
++ &lea ($key,&DWP(32,$key));
++ &xorps ($inout0,$rndkey0);
++ &pxor ($inout1,$rndkey0); # pxor does better here
++ eval"&aes${p} ($inout0,$rndkey1)";
++ &pxor ($inout2,$rndkey0);
++ eval"&aes${p} ($inout1,$rndkey1)";
++ &pxor ($inout3,$rndkey0);
++ &dec ($rounds);
++ eval"&aes${p} ($inout2,$rndkey1)";
++ &pxor ($inout4,$rndkey0);
++ eval"&aes${p} ($inout3,$rndkey1)";
++ &pxor ($inout5,$rndkey0);
++ eval"&aes${p} ($inout4,$rndkey1)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ eval"&aes${p} ($inout5,$rndkey1)";
++ &jmp (&label("_aesni_${p}rypt6_enter"));
++
++ &set_label("${p}6_loop",16);
++ eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout1,$rndkey1)";
++ &dec ($rounds);
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p} ($inout3,$rndkey1)";
++ eval"&aes${p} ($inout4,$rndkey1)";
++ eval"&aes${p} ($inout5,$rndkey1)";
++ &set_label("_aesni_${p}rypt6_enter",16);
++ &$movekey ($rndkey1,&QWP(16,$key));
++ eval"&aes${p} ($inout0,$rndkey0)";
++ eval"&aes${p} ($inout1,$rndkey0)";
++ &lea ($key,&DWP(32,$key));
++ eval"&aes${p} ($inout2,$rndkey0)";
++ eval"&aes${p} ($inout3,$rndkey0)";
++ eval"&aes${p} ($inout4,$rndkey0)";
++ eval"&aes${p} ($inout5,$rndkey0)";
++ &$movekey ($rndkey0,&QWP(0,$key));
++ &jnz (&label("${p}6_loop"));
+
-+ aes${dir} $rndkey1,$inout0
-+ aes${dir} $rndkey1,$inout1
-+ aes${dir} $rndkey1,$inout2
-+ aes${dir} $rndkey1,$inout3
-+ aes${dir} $rndkey1,$inout4
-+ aes${dir} $rndkey1,$inout5
-+ aes${dir} $rndkey1,$inout6
-+ aes${dir} $rndkey1,$inout7
-+ aes${dir}last $rndkey0,$inout0
-+ aes${dir}last $rndkey0,$inout1
-+ aes${dir}last $rndkey0,$inout2
-+ aes${dir}last $rndkey0,$inout3
-+ aes${dir}last $rndkey0,$inout4
-+ aes${dir}last $rndkey0,$inout5
-+ aes${dir}last $rndkey0,$inout6
-+ aes${dir}last $rndkey0,$inout7
-+ ret
-+.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
-+___
++ eval"&aes${p} ($inout0,$rndkey1)";
++ eval"&aes${p} ($inout1,$rndkey1)";
++ eval"&aes${p} ($inout2,$rndkey1)";
++ eval"&aes${p} ($inout3,$rndkey1)";
++ eval"&aes${p} ($inout4,$rndkey1)";
++ eval"&aes${p} ($inout5,$rndkey1)";
++ eval"&aes${p}last ($inout0,$rndkey0)";
++ eval"&aes${p}last ($inout1,$rndkey0)";
++ eval"&aes${p}last ($inout2,$rndkey0)";
++ eval"&aes${p}last ($inout3,$rndkey0)";
++ eval"&aes${p}last ($inout4,$rndkey0)";
++ eval"&aes${p}last ($inout5,$rndkey0)";
++ &ret();
++ &function_end_B("_aesni_${p}rypt6");
+}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
+-
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
-+&aesni_generate8("enc") if ($PREFIX eq "aesni");
-+&aesni_generate8("dec");
-
++
if ($PREFIX eq "aesni") {
-+########################################################################
++######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
- # size_t length, const AES_KEY *key,
- # int enc);
-@@ -222,54 +518,98 @@ $code.=<<___;
- .type aesni_ecb_encrypt,\@function,5
- .align 16
- aesni_ecb_encrypt:
-- cmp \$16,$len # check length
-- jb .Lecb_ret
--
-- mov 240($key),$rounds # pull $rounds
- and \$-16,$len
-+ jz .Lecb_ret
-+
-+ mov 240($key),$rounds # key->rounds
-+ $movkey ($key),$rndkey0
- mov $key,$key_ # backup $key
-- test %r8d,%r8d # 5th argument
- mov $rounds,$rnds_ # backup $rounds
-+ test %r8d,%r8d # 5th argument
- jz .Lecb_decrypt
- #--------------------------- ECB ENCRYPT ------------------------------#
-- sub \$0x40,$len
-- jbe .Lecb_enc_tail
-- jmp .Lecb_enc_loop3
-+ cmp \$0x80,$len
-+ jb .Lecb_enc_tail
+ # size_t length, const AES_KEY *key,
+ # int enc);
+@@ -232,62 +355,93 @@ if ($PREFIX eq "aesni") {
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+- &mov ($rounds,&wparam(4));
+- &cmp ($len,16);
+- &jb (&label("ecb_ret"));
++ &mov ($rounds_,&wparam(4));
+ &and ($len,-16);
+- &test ($rounds,$rounds)
++ &jz (&label("ecb_ret"));
+ &mov ($rounds,&DWP(240,$key));
++ &test ($rounds_,$rounds_);
++ &jz (&label("ecb_decrypt"));
+
-+ movdqu ($inp),$inout0
-+ movdqu 0x10($inp),$inout1
-+ movdqu 0x20($inp),$inout2
-+ movdqu 0x30($inp),$inout3
-+ movdqu 0x40($inp),$inout4
-+ movdqu 0x50($inp),$inout5
-+ movdqu 0x60($inp),$inout6
-+ movdqu 0x70($inp),$inout7
-+ lea 0x80($inp),$inp
-+ sub \$0x80,$len
-+ jmp .Lecb_enc_loop8_enter
- .align 16
--.Lecb_enc_loop3:
-- movups ($inp),$inout0
-- movups 0x10($inp),$inout1
-- movups 0x20($inp),$inout2
-- call _aesni_encrypt3
-- sub \$0x30,$len
-- lea 0x30($inp),$inp
-- lea 0x30($out),$out
-- movups $inout0,-0x30($out)
-- mov $rnds_,$rounds # restore $rounds
-- movups $inout1,-0x20($out)
-+.Lecb_enc_loop8:
-+ movups $inout0,($out)
- mov $key_,$key # restore $key
-- movups $inout2,-0x10($out)
-- ja .Lecb_enc_loop3
-+ movdqu ($inp),$inout0
-+ mov $rnds_,$rounds # restore $rounds
-+ movups $inout1,0x10($out)
-+ movdqu 0x10($inp),$inout1
-+ movups $inout2,0x20($out)
-+ movdqu 0x20($inp),$inout2
-+ movups $inout3,0x30($out)
-+ movdqu 0x30($inp),$inout3
-+ movups $inout4,0x40($out)
-+ movdqu 0x40($inp),$inout4
-+ movups $inout5,0x50($out)
-+ movdqu 0x50($inp),$inout5
-+ movups $inout6,0x60($out)
-+ movdqu 0x60($inp),$inout6
-+ movups $inout7,0x70($out)
-+ lea 0x80($out),$out
-+ movdqu 0x70($inp),$inout7
-+ lea 0x80($inp),$inp
-+.Lecb_enc_loop8_enter:
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+- &jz (&label("ecb_decrypt"));
++ &cmp ($len,0x60);
++ &jb (&label("ecb_enc_tail"));
--.Lecb_enc_tail:
-- add \$0x40,$len
-+ call _aesni_encrypt8
+- &sub ($len,0x40);
+- &jbe (&label("ecb_enc_tail"));
+- &jmp (&label("ecb_enc_loop3"));
++ &movdqu ($inout0,&QWP(0,$inp));
++ &movdqu ($inout1,&QWP(0x10,$inp));
++ &movdqu ($inout2,&QWP(0x20,$inp));
++ &movdqu ($inout3,&QWP(0x30,$inp));
++ &movdqu ($inout4,&QWP(0x40,$inp));
++ &movdqu ($inout5,&QWP(0x50,$inp));
++ &lea ($inp,&DWP(0x60,$inp));
++ &sub ($len,0x60);
++ &jmp (&label("ecb_enc_loop6_enter"));
+
-+ sub \$0x80,$len
-+ jnc .Lecb_enc_loop8
++&set_label("ecb_enc_loop6",16);
++ &movups (&QWP(0,$out),$inout0);
++ &movdqu ($inout0,&QWP(0,$inp));
++ &movups (&QWP(0x10,$out),$inout1);
++ &movdqu ($inout1,&QWP(0x10,$inp));
++ &movups (&QWP(0x20,$out),$inout2);
++ &movdqu ($inout2,&QWP(0x20,$inp));
++ &movups (&QWP(0x30,$out),$inout3);
++ &movdqu ($inout3,&QWP(0x30,$inp));
++ &movups (&QWP(0x40,$out),$inout4);
++ &movdqu ($inout4,&QWP(0x40,$inp));
++ &movups (&QWP(0x50,$out),$inout5);
++ &lea ($out,&DWP(0x60,$out));
++ &movdqu ($inout5,&QWP(0x50,$inp));
++ &lea ($inp,&DWP(0x60,$inp));
++&set_label("ecb_enc_loop6_enter");
+
-+ movups $inout0,($out)
-+ mov $key_,$key # restore $key
-+ movups $inout1,0x10($out)
-+ mov $rnds_,$rounds # restore $rounds
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
-+ movups $inout6,0x60($out)
-+ movups $inout7,0x70($out)
-+ lea 0x80($out),$out
-+ add \$0x80,$len
- jz .Lecb_ret
++ &call ("_aesni_encrypt6");
+
+-&set_label("ecb_enc_loop3",16);
+- &movups ($inout0,&QWP(0,$inp));
+- &movups ($inout1,&QWP(0x10,$inp));
+- &movups ($inout2,&QWP(0x20,$inp));
+- &call ("_aesni_encrypt3");
+- &sub ($len,0x30);
+- &lea ($inp,&DWP(0x30,$inp));
+- &lea ($out,&DWP(0x30,$out));
+- &movups (&QWP(-0x30,$out),$inout0);
+ &mov ($key,$key_); # restore $key
+- &movups (&QWP(-0x20,$out),$inout1);
+ &mov ($rounds,$rounds_); # restore $rounds
+- &movups (&QWP(-0x10,$out),$inout2);
+- &ja (&label("ecb_enc_loop3"));
++ &sub ($len,0x60);
++ &jnc (&label("ecb_enc_loop6"));
+
+-&set_label("ecb_enc_tail");
+- &add ($len,0x40);
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &movups (&QWP(0x40,$out),$inout4);
++ &movups (&QWP(0x50,$out),$inout5);
++ &lea ($out,&DWP(0x60,$out));
++ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+- &cmp ($len,0x10);
++&set_label("ecb_enc_tail");
+ &movups ($inout0,&QWP(0,$inp));
+- &je (&label("ecb_enc_one"));
+ &cmp ($len,0x20);
++ &jb (&label("ecb_enc_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_enc_two"));
+- &cmp ($len,0x30);
+ &movups ($inout2,&QWP(0x20,$inp));
+- &je (&label("ecb_enc_three"));
++ &cmp ($len,0x40);
++ &jb (&label("ecb_enc_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+- &call ("_aesni_encrypt4");
++ &je (&label("ecb_enc_four"));
++ &movups ($inout4,&QWP(0x40,$inp));
++ &xorps ($inout5,$inout5);
++ &call ("_aesni_encrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
++ &movups (&QWP(0x40,$out),$inout4);
+ jmp (&label("ecb_ret"));
+
+ &set_label("ecb_enc_one",16);
+- &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
++ if ($inline)
++ { &aesni_inline_generate1("enc"); }
++ else
++ { &call ("_aesni_encrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
-- cmp \$0x10,$len
-+.Lecb_enc_tail:
- movups ($inp),$inout0
-- je .Lecb_enc_one
- cmp \$0x20,$len
-+ jb .Lecb_enc_one
- movups 0x10($inp),$inout1
- je .Lecb_enc_two
-- cmp \$0x30,$len
- movups 0x20($inp),$inout2
-- je .Lecb_enc_three
-+ cmp \$0x40,$len
-+ jb .Lecb_enc_three
- movups 0x30($inp),$inout3
-- call _aesni_encrypt4
-+ je .Lecb_enc_four
-+ movups 0x40($inp),$inout4
-+ cmp \$0x60,$len
-+ jb .Lecb_enc_five
-+ movups 0x50($inp),$inout5
-+ je .Lecb_enc_six
-+ movdqu 0x60($inp),$inout6
-+ call _aesni_encrypt8
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
-+ movups $inout6,0x60($out)
- jmp .Lecb_ret
- .align 16
- .Lecb_enc_one:
-@@ -280,6 +620,7 @@ $code.=<<___;
- jmp .Lecb_ret
- .align 16
- .Lecb_enc_two:
-+ xorps $inout2,$inout2
- call _aesni_encrypt3
- movups $inout0,($out)
- movups $inout1,0x10($out)
-@@ -291,47 +632,121 @@ $code.=<<___;
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- jmp .Lecb_ret
-+.align 16
-+.Lecb_enc_four:
-+ call _aesni_encrypt4
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ jmp .Lecb_ret
-+.align 16
-+.Lecb_enc_five:
-+ xorps $inout5,$inout5
-+ call _aesni_encrypt6
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ jmp .Lecb_ret
-+.align 16
-+.Lecb_enc_six:
-+ call _aesni_encrypt6
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
-+ jmp .Lecb_ret
- #--------------------------- ECB DECRYPT ------------------------------#
- .align 16
- .Lecb_decrypt:
-- sub \$0x40,$len
-- jbe .Lecb_dec_tail
-- jmp .Lecb_dec_loop3
-+ cmp \$0x80,$len
-+ jb .Lecb_dec_tail
+ &set_label("ecb_enc_two",16);
++ &xorps ($inout2,$inout2);
+ &call ("_aesni_encrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+@@ -300,53 +454,95 @@ if ($PREFIX eq "aesni") {
+ &movups (&QWP(0x20,$out),$inout2);
+ &jmp (&label("ecb_ret"));
+
++&set_label("ecb_enc_four",16);
++ &call ("_aesni_encrypt4");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &jmp (&label("ecb_ret"));
++######################################################################
+ &set_label("ecb_decrypt",16);
+- &sub ($len,0x40);
+- &jbe (&label("ecb_dec_tail"));
+- &jmp (&label("ecb_dec_loop3"));
++ &mov ($key_,$key); # backup $key
++ &mov ($rounds_,$rounds); # backup $rounds
++ &cmp ($len,0x60);
++ &jb (&label("ecb_dec_tail"));
+
-+ movdqu ($inp),$inout0
-+ movdqu 0x10($inp),$inout1
-+ movdqu 0x20($inp),$inout2
-+ movdqu 0x30($inp),$inout3
-+ movdqu 0x40($inp),$inout4
-+ movdqu 0x50($inp),$inout5
-+ movdqu 0x60($inp),$inout6
-+ movdqu 0x70($inp),$inout7
-+ lea 0x80($inp),$inp
-+ sub \$0x80,$len
-+ jmp .Lecb_dec_loop8_enter
- .align 16
--.Lecb_dec_loop3:
-- movups ($inp),$inout0
-- movups 0x10($inp),$inout1
-- movups 0x20($inp),$inout2
-- call _aesni_decrypt3
-- sub \$0x30,$len
-- lea 0x30($inp),$inp
-- lea 0x30($out),$out
-- movups $inout0,-0x30($out)
-- mov $rnds_,$rounds # restore $rounds
-- movups $inout1,-0x20($out)
-+.Lecb_dec_loop8:
-+ movups $inout0,($out)
- mov $key_,$key # restore $key
-- movups $inout2,-0x10($out)
-- ja .Lecb_dec_loop3
-+ movdqu ($inp),$inout0
-+ mov $rnds_,$rounds # restore $rounds
-+ movups $inout1,0x10($out)
-+ movdqu 0x10($inp),$inout1
-+ movups $inout2,0x20($out)
-+ movdqu 0x20($inp),$inout2
-+ movups $inout3,0x30($out)
-+ movdqu 0x30($inp),$inout3
-+ movups $inout4,0x40($out)
-+ movdqu 0x40($inp),$inout4
-+ movups $inout5,0x50($out)
-+ movdqu 0x50($inp),$inout5
-+ movups $inout6,0x60($out)
-+ movdqu 0x60($inp),$inout6
-+ movups $inout7,0x70($out)
-+ lea 0x80($out),$out
-+ movdqu 0x70($inp),$inout7
-+ lea 0x80($inp),$inp
-+.Lecb_dec_loop8_enter:
++ &movdqu ($inout0,&QWP(0,$inp));
++ &movdqu ($inout1,&QWP(0x10,$inp));
++ &movdqu ($inout2,&QWP(0x20,$inp));
++ &movdqu ($inout3,&QWP(0x30,$inp));
++ &movdqu ($inout4,&QWP(0x40,$inp));
++ &movdqu ($inout5,&QWP(0x50,$inp));
++ &lea ($inp,&DWP(0x60,$inp));
++ &sub ($len,0x60);
++ &jmp (&label("ecb_dec_loop6_enter"));
+
-+ call _aesni_decrypt8
++&set_label("ecb_dec_loop6",16);
++ &movups (&QWP(0,$out),$inout0);
++ &movdqu ($inout0,&QWP(0,$inp));
++ &movups (&QWP(0x10,$out),$inout1);
++ &movdqu ($inout1,&QWP(0x10,$inp));
++ &movups (&QWP(0x20,$out),$inout2);
++ &movdqu ($inout2,&QWP(0x20,$inp));
++ &movups (&QWP(0x30,$out),$inout3);
++ &movdqu ($inout3,&QWP(0x30,$inp));
++ &movups (&QWP(0x40,$out),$inout4);
++ &movdqu ($inout4,&QWP(0x40,$inp));
++ &movups (&QWP(0x50,$out),$inout5);
++ &lea ($out,&DWP(0x60,$out));
++ &movdqu ($inout5,&QWP(0x50,$inp));
++ &lea ($inp,&DWP(0x60,$inp));
++&set_label("ecb_dec_loop6_enter");
+
-+ $movkey ($key_),$rndkey0
-+ sub \$0x80,$len
-+ jnc .Lecb_dec_loop8
++ &call ("_aesni_decrypt6");
--.Lecb_dec_tail:
-- add \$0x40,$len
-+ movups $inout0,($out)
-+ mov $key_,$key # restore $key
-+ movups $inout1,0x10($out)
-+ mov $rnds_,$rounds # restore $rounds
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
-+ movups $inout6,0x60($out)
-+ movups $inout7,0x70($out)
-+ lea 0x80($out),$out
-+ add \$0x80,$len
- jz .Lecb_ret
+-&set_label("ecb_dec_loop3",16);
+- &movups ($inout0,&QWP(0,$inp));
+- &movups ($inout1,&QWP(0x10,$inp));
+- &movups ($inout2,&QWP(0x20,$inp));
+- &call ("_aesni_decrypt3");
+- &sub ($len,0x30);
+- &lea ($inp,&DWP(0x30,$inp));
+- &lea ($out,&DWP(0x30,$out));
+- &movups (&QWP(-0x30,$out),$inout0);
+ &mov ($key,$key_); # restore $key
+- &movups (&QWP(-0x20,$out),$inout1);
+ &mov ($rounds,$rounds_); # restore $rounds
+- &movups (&QWP(-0x10,$out),$inout2);
+- &ja (&label("ecb_dec_loop3"));
++ &sub ($len,0x60);
++ &jnc (&label("ecb_dec_loop6"));
+
+-&set_label("ecb_dec_tail");
+- &add ($len,0x40);
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
++ &movups (&QWP(0x40,$out),$inout4);
++ &movups (&QWP(0x50,$out),$inout5);
++ &lea ($out,&DWP(0x60,$out));
++ &add ($len,0x60);
+ &jz (&label("ecb_ret"));
+
+- &cmp ($len,0x10);
++&set_label("ecb_dec_tail");
+ &movups ($inout0,&QWP(0,$inp));
+- &je (&label("ecb_dec_one"));
+ &cmp ($len,0x20);
++ &jb (&label("ecb_dec_one"));
+ &movups ($inout1,&QWP(0x10,$inp));
+ &je (&label("ecb_dec_two"));
+- &cmp ($len,0x30);
+ &movups ($inout2,&QWP(0x20,$inp));
+- &je (&label("ecb_dec_three"));
++ &cmp ($len,0x40);
++ &jb (&label("ecb_dec_three"));
+ &movups ($inout3,&QWP(0x30,$inp));
+- &call ("_aesni_decrypt4");
++ &je (&label("ecb_dec_four"));
++ &movups ($inout4,&QWP(0x40,$inp));
++ &xorps ($inout5,$inout5);
++ &call ("_aesni_decrypt6");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+ &movups (&QWP(0x30,$out),$inout3);
++ &movups (&QWP(0x40,$out),$inout4);
+ &jmp (&label("ecb_ret"));
+
+ &set_label("ecb_dec_one",16);
+- &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3");
++ if ($inline)
++ { &aesni_inline_generate1("dec"); }
++ else
++ { &call ("_aesni_decrypt1"); }
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("ecb_ret"));
-- cmp \$0x10,$len
-+.Lecb_dec_tail:
- movups ($inp),$inout0
-- je .Lecb_dec_one
- cmp \$0x20,$len
-+ jb .Lecb_dec_one
- movups 0x10($inp),$inout1
- je .Lecb_dec_two
-- cmp \$0x30,$len
- movups 0x20($inp),$inout2
-- je .Lecb_dec_three
-+ cmp \$0x40,$len
-+ jb .Lecb_dec_three
- movups 0x30($inp),$inout3
-- call _aesni_decrypt4
-+ je .Lecb_dec_four
-+ movups 0x40($inp),$inout4
-+ cmp \$0x60,$len
-+ jb .Lecb_dec_five
-+ movups 0x50($inp),$inout5
-+ je .Lecb_dec_six
-+ movups 0x60($inp),$inout6
-+ $movkey ($key),$rndkey0
-+ call _aesni_decrypt8
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
-+ movups $inout6,0x60($out)
- jmp .Lecb_ret
- .align 16
- .Lecb_dec_one:
-@@ -342,6 +757,7 @@ $code.=<<___;
- jmp .Lecb_ret
- .align 16
- .Lecb_dec_two:
-+ xorps $inout2,$inout2
- call _aesni_decrypt3
- movups $inout0,($out)
- movups $inout1,0x10($out)
-@@ -352,6 +768,34 @@ $code.=<<___;
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
-+ jmp .Lecb_ret
-+.align 16
-+.Lecb_dec_four:
-+ call _aesni_decrypt4
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ jmp .Lecb_ret
-+.align 16
-+.Lecb_dec_five:
-+ xorps $inout5,$inout5
-+ call _aesni_decrypt6
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ jmp .Lecb_ret
-+.align 16
-+.Lecb_dec_six:
-+ call _aesni_decrypt6
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
+ &set_label("ecb_dec_two",16);
++ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+@@ -357,28 +553,42 @@ if ($PREFIX eq "aesni") {
+ &movups (&QWP(0,$out),$inout0);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
++ &jmp (&label("ecb_ret"));
++
++&set_label("ecb_dec_four",16);
++ &call ("_aesni_decrypt4");
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &movups (&QWP(0x20,$out),$inout2);
++ &movups (&QWP(0x30,$out),$inout3);
- .Lecb_ret:
- ret
-@@ -362,7 +806,8 @@ ___
+ &set_label("ecb_ret");
+ &function_end("aesni_ecb_encrypt");
+ }
+
++######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
- # size_t length, const AES_KEY *key,
- # unsigned char *ivp,const int enc);
--$reserved = $win64?0x40:-0x18; # used in decrypt
-+{
-+my $reserved = $win64?0x40:-0x18; # used in decrypt
- $code.=<<___;
- .globl ${PREFIX}_cbc_encrypt
- .type ${PREFIX}_cbc_encrypt,\@function,6
-@@ -371,30 +816,30 @@ ${PREFIX}_cbc_encrypt:
- test $len,$len # check length
- jz .Lcbc_ret
+ # size_t length, const AES_KEY *key,
+ # unsigned char *ivp,const int enc);
+ &function_begin("${PREFIX}_cbc_encrypt");
+ &mov ($inp,&wparam(0));
++ &mov ($rounds_,"esp");
+ &mov ($out,&wparam(1));
++ &sub ($rounds_,24);
+ &mov ($len,&wparam(2));
++ &and ($rounds_,-16);
+ &mov ($key,&wparam(3));
+- &test ($len,$len);
+ &mov ($key_,&wparam(4));
+- &jz (&label("cbc_ret"));
++ &test ($len,$len);
++ &jz (&label("cbc_abort"));
-- mov 240($key),$rnds_ # pull $rounds
-+ mov 240($key),$rnds_ # key->rounds
- mov $key,$key_ # backup $key
- test %r9d,%r9d # 6th argument
- jz .Lcbc_decrypt
- #--------------------------- CBC ENCRYPT ------------------------------#
- movups ($ivp),$inout0 # load iv as initial state
-- cmp \$16,$len
- mov $rnds_,$rounds
-+ cmp \$16,$len
- jb .Lcbc_enc_tail
- sub \$16,$len
- jmp .Lcbc_enc_loop
--.align 16
-+.align 16
- .Lcbc_enc_loop:
- movups ($inp),$inout1 # load input
- lea 16($inp),$inp
-- pxor $inout1,$inout0
-+ #xorps $inout1,$inout0
- ___
-- &aesni_generate1("enc",$key,$rounds);
-+ &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
- $code.=<<___;
-- sub \$16,$len
-- lea 16($out),$out
- mov $rnds_,$rounds # restore $rounds
- mov $key_,$key # restore $key
-- movups $inout0,-16($out) # store output
-+ movups $inout0,0($out) # store output
-+ lea 16($out),$out
-+ sub \$16,$len
- jnc .Lcbc_enc_loop
- add \$16,$len
- jnz .Lcbc_enc_tail
-@@ -429,92 +874,238 @@ $code.=<<___ if ($win64);
- ___
- $code.=<<___;
- movups ($ivp),$iv
-- sub \$0x40,$len
- mov $rnds_,$rounds
-+ cmp \$0x70,$len
- jbe .Lcbc_dec_tail
-- jmp .Lcbc_dec_loop3
--.align 16
--.Lcbc_dec_loop3:
-- movups ($inp),$inout0
-+ shr \$1,$rnds_
-+ sub \$0x70,$len
-+ mov $rnds_,$rounds
-+ movaps $iv,$reserved(%rsp)
-+ jmp .Lcbc_dec_loop8_enter
-+.align 16
-+.Lcbc_dec_loop8:
-+ movaps $rndkey0,$reserved(%rsp) # save IV
-+ movups $inout7,($out)
-+ lea 0x10($out),$out
-+.Lcbc_dec_loop8_enter:
-+ $movkey ($key),$rndkey0
-+ movups ($inp),$inout0 # load input
- movups 0x10($inp),$inout1
-- movups 0x20($inp),$inout2
-- movaps $inout0,$in0
-- movaps $inout1,$in1
-- movaps $inout2,$in2
-- call _aesni_decrypt3
-- sub \$0x30,$len
-- lea 0x30($inp),$inp
-- lea 0x30($out),$out
-- pxor $iv,$inout0
-- pxor $in0,$inout1
-- movaps $in2,$iv
-- pxor $in1,$inout2
-- movups $inout0,-0x30($out)
-- mov $rnds_,$rounds # restore $rounds
-- movups $inout1,-0x20($out)
-- mov $key_,$key # restore $key
-- movups $inout2,-0x10($out)
-- ja .Lcbc_dec_loop3
-+ $movkey 16($key),$rndkey1
+ &cmp (&wparam(5),0);
+- &movups ($ivec,&QWP(0,$key_)); # load IV
++ &xchg ($rounds_,"esp"); # alloca
++ &movups ($ivec,&QWP(0,$key_)); # load IV
+ &mov ($rounds,&DWP(240,$key));
+- &mov ($key_,$key); # backup $key
+- &mov ($rounds_,$rounds); # backup $rounds
++ &mov ($key_,$key); # backup $key
++ &mov (&DWP(16,"esp"),$rounds_); # save original %esp
++ &mov ($rounds_,$rounds); # backup $rounds
+ &je (&label("cbc_decrypt"));
--.Lcbc_dec_tail:
-- add \$0x40,$len
-- movups $iv,($ivp)
-- jz .Lcbc_dec_ret
-+ lea 32($key),$key
-+ movdqu 0x20($inp),$inout2
-+ xorps $rndkey0,$inout0
-+ movdqu 0x30($inp),$inout3
-+ xorps $rndkey0,$inout1
-+ movdqu 0x40($inp),$inout4
-+ aesdec $rndkey1,$inout0
-+ pxor $rndkey0,$inout2
-+ movdqu 0x50($inp),$inout5
-+ aesdec $rndkey1,$inout1
-+ pxor $rndkey0,$inout3
-+ movdqu 0x60($inp),$inout6
-+ aesdec $rndkey1,$inout2
-+ pxor $rndkey0,$inout4
-+ movdqu 0x70($inp),$inout7
-+ aesdec $rndkey1,$inout3
-+ pxor $rndkey0,$inout5
-+ dec $rounds
-+ aesdec $rndkey1,$inout4
-+ pxor $rndkey0,$inout6
-+ aesdec $rndkey1,$inout5
-+ pxor $rndkey0,$inout7
-+ $movkey ($key),$rndkey0
-+ aesdec $rndkey1,$inout6
-+ aesdec $rndkey1,$inout7
-+ $movkey 16($key),$rndkey1
-+
-+ call .Ldec_loop8_enter
+ &movaps ($inout0,$ivec);
+@@ -388,15 +598,17 @@ if ($PREFIX eq "aesni") {
+ &jmp (&label("cbc_enc_loop"));
-+ movups ($inp),$rndkey1 # re-load input
-+ movups 0x10($inp),$rndkey0
-+ xorps $reserved(%rsp),$inout0 # ^= IV
-+ xorps $rndkey1,$inout1
-+ movups 0x20($inp),$rndkey1
-+ xorps $rndkey0,$inout2
-+ movups 0x30($inp),$rndkey0
-+ xorps $rndkey1,$inout3
-+ movups 0x40($inp),$rndkey1
-+ xorps $rndkey0,$inout4
-+ movups 0x50($inp),$rndkey0
-+ xorps $rndkey1,$inout5
-+ movups 0x60($inp),$rndkey1
-+ xorps $rndkey0,$inout6
-+ movups 0x70($inp),$rndkey0 # IV
-+ xorps $rndkey1,$inout7
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ mov $rnds_,$rounds # restore $rounds
-+ movups $inout4,0x40($out)
-+ mov $key_,$key # restore $key
-+ movups $inout5,0x50($out)
-+ lea 0x80($inp),$inp
-+ movups $inout6,0x60($out)
-+ lea 0x70($out),$out
-+ sub \$0x80,$len
-+ ja .Lcbc_dec_loop8
+ &set_label("cbc_enc_loop",16);
+- &movups ($ivec,&QWP(0,$inp));
++ &movups ($ivec,&QWP(0,$inp)); # input actually
+ &lea ($inp,&DWP(16,$inp));
+- &pxor ($inout0,$ivec);
+- &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt3");
+- &sub ($len,16);
+- &lea ($out,&DWP(16,$out));
++ if ($inline)
++ { &aesni_inline_generate1("enc",$inout0,$ivec); }
++ else
++ { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
+ &mov ($rounds,$rounds_); # restore $rounds
+ &mov ($key,$key_); # restore $key
+- &movups (&QWP(-16,$out),$inout0);
++ &movups (&QWP(0,$out),$inout0); # store output
++ &lea ($out,&DWP(16,$out));
++ &sub ($len,16);
+ &jnc (&label("cbc_enc_loop"));
+ &add ($len,16);
+ &jnz (&label("cbc_enc_tail"));
+@@ -415,90 +627,151 @@ if ($PREFIX eq "aesni") {
+ &mov ($inp,$out); # $inp and $out are the same
+ &mov ($key,$key_); # restore $key
+ &jmp (&label("cbc_enc_loop"));
+-
++######################################################################
+ &set_label("cbc_decrypt",16);
+- &sub ($len,0x40);
++ &cmp ($len,0x50);
+ &jbe (&label("cbc_dec_tail"));
+- &jmp (&label("cbc_dec_loop3"));
++ &movaps (&QWP(0,"esp"),$ivec); # save IV
++ &sub ($len,0x50);
++ &jmp (&label("cbc_dec_loop6_enter"));
+
-+ movaps $inout7,$inout0
-+ movaps $rndkey0,$iv
-+ add \$0x70,$len
-+ jle .Lcbc_dec_tail_collected
-+ movups $inout0,($out)
-+ lea 1($rnds_,$rnds_),$rounds
-+ lea 0x10($out),$out
-+.Lcbc_dec_tail:
- movups ($inp),$inout0
-- cmp \$0x10,$len
- movaps $inout0,$in0
-+ cmp \$0x10,$len
- jbe .Lcbc_dec_one
++&set_label("cbc_dec_loop6",16);
++ &movaps (&QWP(0,"esp"),$rndkey0); # save IV
++ &movups (&QWP(0,$out),$inout5);
++ &lea ($out,&DWP(0x10,$out));
++&set_label("cbc_dec_loop6_enter");
++ &movdqu ($inout0,&QWP(0,$inp));
++ &movdqu ($inout1,&QWP(0x10,$inp));
++ &movdqu ($inout2,&QWP(0x20,$inp));
++ &movdqu ($inout3,&QWP(0x30,$inp));
++ &movdqu ($inout4,&QWP(0x40,$inp));
++ &movdqu ($inout5,&QWP(0x50,$inp));
+
+-&set_label("cbc_dec_loop3",16);
+- &movups ($inout0,&QWP(0,$inp));
+- &movups ($inout1,&QWP(0x10,$inp));
+- &movups ($inout2,&QWP(0x20,$inp));
+- &movaps ($in0,$inout0);
+- &movaps ($in1,$inout1);
+- &call ("_aesni_decrypt3");
+- &sub ($len,0x30);
+- &lea ($inp,&DWP(0x30,$inp));
+- &lea ($out,&DWP(0x30,$out));
+- &pxor ($inout0,$ivec);
+- &pxor ($inout1,$in0);
+- &movups ($ivec,&QWP(-0x10,$inp));
+- &pxor ($inout2,$in1);
+- &movups (&QWP(-0x30,$out),$inout0);
+- &mov ($rounds,$rounds_) # restore $rounds
+- &movups (&QWP(-0x20,$out),$inout1);
+- &mov ($key,$key_); # restore $key
+- &movups (&QWP(-0x10,$out),$inout2);
+- &ja (&label("cbc_dec_loop3"));
++ &call ("_aesni_decrypt6");
+
++ &movups ($rndkey1,&QWP(0,$inp));
++ &movups ($rndkey0,&QWP(0x10,$inp));
++ &xorps ($inout0,&QWP(0,"esp")); # ^=IV
++ &xorps ($inout1,$rndkey1);
++ &movups ($rndkey1,&QWP(0x20,$inp));
++ &xorps ($inout2,$rndkey0);
++ &movups ($rndkey0,&QWP(0x30,$inp));
++ &xorps ($inout3,$rndkey1);
++ &movups ($rndkey1,&QWP(0x40,$inp));
++ &xorps ($inout4,$rndkey0);
++ &movups ($rndkey0,&QWP(0x50,$inp)); # IV
++ &xorps ($inout5,$rndkey1);
++ &movups (&QWP(0,$out),$inout0);
++ &movups (&QWP(0x10,$out),$inout1);
++ &lea ($inp,&DWP(0x60,$inp));
++ &movups (&QWP(0x20,$out),$inout2);
++ &mov ($rounds,$rounds_) # restore $rounds
++ &movups (&QWP(0x30,$out),$inout3);
++ &mov ($key,$key_); # restore $key
++ &movups (&QWP(0x40,$out),$inout4);
++ &lea ($out,&DWP(0x50,$out));
++ &sub ($len,0x60);
++ &ja (&label("cbc_dec_loop6"));
+
- movups 0x10($inp),$inout1
-- cmp \$0x20,$len
- movaps $inout1,$in1
-+ cmp \$0x20,$len
- jbe .Lcbc_dec_two
++ &movaps ($inout0,$inout5);
++ &movaps ($ivec,$rndkey0);
++ &add ($len,0x50);
++ &jle (&label("cbc_dec_tail_collected"));
++ &movups (&QWP(0,$out),$inout0);
++ &lea ($out,&DWP(0x10,$out));
+ &set_label("cbc_dec_tail");
+- &add ($len,0x40);
+- &jz (&label("cbc_ret"));
+-
+ &movups ($inout0,&QWP(0,$inp));
+- &cmp ($len,0x10);
+ &movaps ($in0,$inout0);
++ &cmp ($len,0x10);
+ &jbe (&label("cbc_dec_one"));
+
- movups 0x20($inp),$inout2
-- cmp \$0x30,$len
- movaps $inout2,$in2
-+ cmp \$0x30,$len
- jbe .Lcbc_dec_three
+ &movups ($inout1,&QWP(0x10,$inp));
+- &cmp ($len,0x20);
+ &movaps ($in1,$inout1);
++ &cmp ($len,0x20);
+ &jbe (&label("cbc_dec_two"));
+
- movups 0x30($inp),$inout3
-- call _aesni_decrypt4
-- pxor $iv,$inout0
-- movups 0x30($inp),$iv
-- pxor $in0,$inout1
-+ cmp \$0x40,$len
-+ jbe .Lcbc_dec_four
+ &movups ($inout2,&QWP(0x20,$inp));
+ &cmp ($len,0x30);
+ &jbe (&label("cbc_dec_three"));
+
-+ movups 0x40($inp),$inout4
-+ cmp \$0x50,$len
-+ jbe .Lcbc_dec_five
+ &movups ($inout3,&QWP(0x30,$inp));
+- &call ("_aesni_decrypt4");
++ &cmp ($len,0x40);
++ &jbe (&label("cbc_dec_four"));
+
-+ movups 0x50($inp),$inout5
-+ cmp \$0x60,$len
-+ jbe .Lcbc_dec_six
++ &movups ($inout4,&QWP(0x40,$inp));
++ &movaps (&QWP(0,"esp"),$ivec); # save IV
++ &movups ($inout0,&QWP(0,$inp));
++ &xorps ($inout5,$inout5);
++ &call ("_aesni_decrypt6");
++ &movups ($rndkey1,&QWP(0,$inp));
+ &movups ($rndkey0,&QWP(0x10,$inp));
++ &xorps ($inout0,&QWP(0,"esp")); # ^= IV
++ &xorps ($inout1,$rndkey1);
+ &movups ($rndkey1,&QWP(0x20,$inp));
+- &pxor ($inout0,$ivec);
+- &pxor ($inout1,$in0);
+- &movups ($ivec,&QWP(0x30,$inp));
++ &xorps ($inout2,$rndkey0);
++ &movups ($rndkey0,&QWP(0x30,$inp));
++ &xorps ($inout3,$rndkey1);
++ &movups ($ivec,&QWP(0x40,$inp)); # IV
++ &xorps ($inout4,$rndkey0);
+ &movups (&QWP(0,$out),$inout0);
+- &pxor ($inout2,$rndkey0);
+- &pxor ($inout3,$rndkey1);
+ &movups (&QWP(0x10,$out),$inout1);
+ &movups (&QWP(0x20,$out),$inout2);
+- &movaps ($inout0,$inout3);
+- &lea ($out,&DWP(0x30,$out));
++ &movups (&QWP(0x30,$out),$inout3);
++ &lea ($out,&DWP(0x40,$out));
++ &movaps ($inout0,$inout4);
++ &sub ($len,0x50);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+-&set_label("cbc_dec_one");
+- &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt3");
+- &pxor ($inout0,$ivec);
++&set_label("cbc_dec_one",16);
++ if ($inline)
++ { &aesni_inline_generate1("dec"); }
++ else
++ { &call ("_aesni_decrypt1"); }
++ &xorps ($inout0,$ivec);
+ &movaps ($ivec,$in0);
++ &sub ($len,0x10);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+-&set_label("cbc_dec_two");
++&set_label("cbc_dec_two",16);
++ &xorps ($inout2,$inout2);
+ &call ("_aesni_decrypt3");
+- &pxor ($inout0,$ivec);
+- &pxor ($inout1,$in0);
++ &xorps ($inout0,$ivec);
++ &xorps ($inout1,$in0);
+ &movups (&QWP(0,$out),$inout0);
+ &movaps ($inout0,$inout1);
+- &movaps ($ivec,$in1);
+ &lea ($out,&DWP(0x10,$out));
++ &movaps ($ivec,$in1);
++ &sub ($len,0x20);
+ &jmp (&label("cbc_dec_tail_collected"));
+
+-&set_label("cbc_dec_three");
++&set_label("cbc_dec_three",16);
+ &call ("_aesni_decrypt3");
+- &pxor ($inout0,$ivec);
+- &pxor ($inout1,$in0);
+- &pxor ($inout2,$in1);
++ &xorps ($inout0,$ivec);
++ &xorps ($inout1,$in0);
++ &xorps ($inout2,$in1);
+ &movups (&QWP(0,$out),$inout0);
+- &movups (&QWP(0x10,$out),$inout1);
+ &movaps ($inout0,$inout2);
+- &movups ($ivec,&QWP(0x20,$inp));
++ &movups (&QWP(0x10,$out),$inout1);
+ &lea ($out,&DWP(0x20,$out));
++ &movups ($ivec,&QWP(0x20,$inp));
++ &sub ($len,0x30);
++ &jmp (&label("cbc_dec_tail_collected"));
+
-+ movups 0x60($inp),$inout6
-+ movaps $iv,$reserved(%rsp) # save IV
-+ call _aesni_decrypt8
-+ movups ($inp),$rndkey1
-+ movups 0x10($inp),$rndkey0
-+ xorps $reserved(%rsp),$inout0 # ^= IV
-+ xorps $rndkey1,$inout1
-+ movups 0x20($inp),$rndkey1
-+ xorps $rndkey0,$inout2
-+ movups 0x30($inp),$rndkey0
-+ xorps $rndkey1,$inout3
-+ movups 0x40($inp),$rndkey1
-+ xorps $rndkey0,$inout4
-+ movups 0x50($inp),$rndkey0
-+ xorps $rndkey1,$inout5
-+ movups 0x60($inp),$iv # IV
-+ xorps $rndkey0,$inout6
- movups $inout0,($out)
-- pxor $in1,$inout2
- movups $inout1,0x10($out)
-- pxor $in2,$inout3
- movups $inout2,0x20($out)
-- movaps $inout3,$inout0
-- lea 0x30($out),$out
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ movups $inout5,0x50($out)
-+ lea 0x60($out),$out
-+ movaps $inout6,$inout0
-+ sub \$0x70,$len
- jmp .Lcbc_dec_tail_collected
- .align 16
- .Lcbc_dec_one:
- ___
- &aesni_generate1("dec",$key,$rounds);
- $code.=<<___;
-- pxor $iv,$inout0
-+ xorps $iv,$inout0
- movaps $in0,$iv
-+ sub \$0x10,$len
- jmp .Lcbc_dec_tail_collected
- .align 16
- .Lcbc_dec_two:
-+ xorps $inout2,$inout2
- call _aesni_decrypt3
-- pxor $iv,$inout0
-- pxor $in0,$inout1
-+ xorps $iv,$inout0
-+ xorps $in0,$inout1
- movups $inout0,($out)
- movaps $in1,$iv
- movaps $inout1,$inout0
- lea 0x10($out),$out
-+ sub \$0x20,$len
- jmp .Lcbc_dec_tail_collected
- .align 16
- .Lcbc_dec_three:
- call _aesni_decrypt3
-- pxor $iv,$inout0
-- pxor $in0,$inout1
-+ xorps $iv,$inout0
-+ xorps $in0,$inout1
- movups $inout0,($out)
-- pxor $in1,$inout2
-+ xorps $in1,$inout2
- movups $inout1,0x10($out)
- movaps $in2,$iv
- movaps $inout2,$inout0
- lea 0x20($out),$out
-+ sub \$0x30,$len
-+ jmp .Lcbc_dec_tail_collected
-+.align 16
-+.Lcbc_dec_four:
-+ call _aesni_decrypt4
-+ xorps $iv,$inout0
-+ movups 0x30($inp),$iv
-+ xorps $in0,$inout1
-+ movups $inout0,($out)
-+ xorps $in1,$inout2
-+ movups $inout1,0x10($out)
-+ xorps $in2,$inout3
-+ movups $inout2,0x20($out)
-+ movaps $inout3,$inout0
-+ lea 0x30($out),$out
-+ sub \$0x40,$len
-+ jmp .Lcbc_dec_tail_collected
-+.align 16
-+.Lcbc_dec_five:
-+ xorps $inout5,$inout5
-+ call _aesni_decrypt6
-+ movups 0x10($inp),$rndkey1
-+ movups 0x20($inp),$rndkey0
-+ xorps $iv,$inout0
-+ xorps $in0,$inout1
-+ xorps $rndkey1,$inout2
-+ movups 0x30($inp),$rndkey1
-+ xorps $rndkey0,$inout3
-+ movups 0x40($inp),$iv
-+ xorps $rndkey1,$inout4
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ lea 0x40($out),$out
-+ movaps $inout4,$inout0
-+ sub \$0x50,$len
-+ jmp .Lcbc_dec_tail_collected
-+.align 16
-+.Lcbc_dec_six:
-+ call _aesni_decrypt6
-+ movups 0x10($inp),$rndkey1
-+ movups 0x20($inp),$rndkey0
-+ xorps $iv,$inout0
-+ xorps $in0,$inout1
-+ xorps $rndkey1,$inout2
-+ movups 0x30($inp),$rndkey1
-+ xorps $rndkey0,$inout3
-+ movups 0x40($inp),$rndkey0
-+ xorps $rndkey1,$inout4
-+ movups 0x50($inp),$iv
-+ xorps $rndkey0,$inout5
-+ movups $inout0,($out)
-+ movups $inout1,0x10($out)
-+ movups $inout2,0x20($out)
-+ movups $inout3,0x30($out)
-+ movups $inout4,0x40($out)
-+ lea 0x50($out),$out
-+ movaps $inout5,$inout0
-+ sub \$0x60,$len
- jmp .Lcbc_dec_tail_collected
- .align 16
- .Lcbc_dec_tail_collected:
-@@ -523,10 +1114,12 @@ $code.=<<___;
- jnz .Lcbc_dec_tail_partial
- movups $inout0,($out)
- jmp .Lcbc_dec_ret
-+.align 16
- .Lcbc_dec_tail_partial:
- movaps $inout0,$reserved(%rsp)
-+ mov \$16,%rcx
- mov $out,%rdi
-- mov $len,%rcx
-+ sub $len,%rcx
- lea $reserved(%rsp),%rsi
- .long 0x9066A4F3 # rep movsb
++&set_label("cbc_dec_four",16);
++ &call ("_aesni_decrypt4");
++ &movups ($rndkey1,&QWP(0x10,$inp));
++ &movups ($rndkey0,&QWP(0x20,$inp));
++ &xorps ($inout0,$ivec);
++ &movups ($ivec,&QWP(0x30,$inp));
++ &xorps ($inout1,$in0);
++ &movups (&QWP(0,$out),$inout0);
++ &xorps ($inout2,$rndkey1);
++ &movups (&QWP(0x10,$out),$inout1);
++ &xorps ($inout3,$rndkey0);
++ &movups (&QWP(0x20,$out),$inout2);
++ &lea ($out,&DWP(0x30,$out));
++ &movaps ($inout0,$inout3);
++ &sub ($len,0x40);
-@@ -544,7 +1137,7 @@ $code.=<<___;
- ret
- .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
- ___
--
-+}
- # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
- # int bits, AES_KEY *key)
- { my ($inp,$bits,$key) = @_4args;
-@@ -556,7 +1149,7 @@ $code.=<<___;
- .align 16
- ${PREFIX}_set_decrypt_key:
- .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
-- call _aesni_set_encrypt_key
-+ call __aesni_set_encrypt_key
- shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
- test %eax,%eax
- jnz .Ldec_key_ret
-@@ -576,9 +1169,9 @@ ${PREFIX}_set_decrypt_key:
- aesimc %xmm1,%xmm1
- lea 16($key),$key
- lea -16($inp),$inp
-- cmp $key,$inp
- $movkey %xmm0,16($inp)
- $movkey %xmm1,-16($key)
-+ cmp $key,$inp
- ja .Ldec_key_inverse
+ &set_label("cbc_dec_tail_collected");
+ &and ($len,15);
+@@ -506,21 +779,21 @@ if ($PREFIX eq "aesni") {
+ &movups (&QWP(0,$out),$inout0);
+ &jmp (&label("cbc_ret"));
- $movkey ($key),%xmm0 # inverse middle
-@@ -605,16 +1198,16 @@ $code.=<<___;
- .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
- .align 16
- ${PREFIX}_set_encrypt_key:
--_aesni_set_encrypt_key:
-+__aesni_set_encrypt_key:
- .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
-- test $inp,$inp
- mov \$-1,%rax
-+ test $inp,$inp
- jz .Lenc_key_ret
- test $key,$key
- jz .Lenc_key_ret
+-&set_label("cbc_dec_tail_partial");
+- &mov ($key_,"esp");
+- &sub ("esp",16);
+- &and ("esp",-16);
++&set_label("cbc_dec_tail_partial",16);
+ &movaps (&QWP(0,"esp"),$inout0);
++ &mov ("ecx",16);
+ &mov ($inp,"esp");
+- &mov ("ecx",$len);
++ &sub ("ecx",$len);
+ &data_word(0xA4F3F689); # rep movsb
+- &mov ("esp",$key_);
+
+ &set_label("cbc_ret");
++ &mov ("esp",&DWP(16,"esp")); # pull original %esp
+ &mov ($key_,&wparam(4));
+ &movups (&QWP(0,$key_),$ivec); # output IV
++&set_label("cbc_abort");
+ &function_end("${PREFIX}_cbc_encrypt");
+-
++
++######################################################################
+ # Mechanical port from aesni-x86_64.pl.
+ #
+ # _aesni_set_encrypt_key is private interface,
+@@ -539,7 +812,7 @@ if ($PREFIX eq "aesni") {
+ &jz (&label("bad_pointer"));
+
+ &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
+- &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
++ &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
+ &lea ($key,&DWP(16,$key));
+ &cmp ($rounds,256);
+ &je (&label("14rounds"));
+@@ -581,11 +854,11 @@ if ($PREFIX eq "aesni") {
+ &lea ($key,&DWP(16,$key));
+ &set_label("key_128_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+- &pxor ("xmm0","xmm4");
+- &shufps ("xmm4","xmm0",0b10001100,);
+- &pxor ("xmm0","xmm4");
+- &pshufd ("xmm1","xmm1",0b11111111); # critical path
+- &pxor ("xmm0","xmm1");
++ &xorps ("xmm0","xmm4");
++ &shufps ("xmm4","xmm0",0b10001100);
++ &xorps ("xmm0","xmm4");
++ &shufps ("xmm1","xmm1",0b11111111); # critical path
++ &xorps ("xmm0","xmm1");
+ &ret();
- movups ($inp),%xmm0 # pull first 128 bits of *userKey
-- pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
-+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
- lea 16($key),%rax
- cmp \$256,$bits
- je .L14rounds
-@@ -729,11 +1322,11 @@ _aesni_set_encrypt_key:
- lea 16(%rax),%rax
- .Lkey_expansion_128_cold:
- shufps \$0b00010000,%xmm0,%xmm4
-- pxor %xmm4, %xmm0
-+ xorps %xmm4, %xmm0
- shufps \$0b10001100,%xmm0,%xmm4
-- pxor %xmm4, %xmm0
-- pshufd \$0b11111111,%xmm1,%xmm1 # critical path
-- pxor %xmm1,%xmm0
-+ xorps %xmm4, %xmm0
-+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
-+ xorps %xmm1,%xmm0
- ret
+ &set_label("12rounds",16);
+@@ -620,11 +893,11 @@ if ($PREFIX eq "aesni") {
+ &movaps ("xmm5","xmm2");
+ &set_label("key_192b_warm");
+ &shufps ("xmm4","xmm0",0b00010000);
+- &movaps ("xmm3","xmm2");
+- &pxor ("xmm0","xmm4");
++ &movdqa ("xmm3","xmm2");
++ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+ &pslldq ("xmm3",4);
+- &pxor ("xmm0","xmm4");
++ &xorps ("xmm0","xmm4");
+ &pshufd ("xmm1","xmm1",0b01010101); # critical path
+ &pxor ("xmm2","xmm3");
+ &pxor ("xmm0","xmm1");
+@@ -683,11 +956,11 @@ if ($PREFIX eq "aesni") {
+ &lea ($key,&DWP(16,$key));
+ &set_label("key_256a_cold");
+ &shufps ("xmm4","xmm0",0b00010000);
+- &pxor ("xmm0","xmm4");
++ &xorps ("xmm0","xmm4");
+ &shufps ("xmm4","xmm0",0b10001100);
+- &pxor ("xmm0","xmm4");
+- &pshufd ("xmm1","xmm1",0b11111111); # critical path
+- &pxor ("xmm0","xmm1");
++ &xorps ("xmm0","xmm4");
++ &shufps ("xmm1","xmm1",0b11111111); # critical path
++ &xorps ("xmm0","xmm1");
+ &ret();
- .align 16
-@@ -744,11 +1337,11 @@ _aesni_set_encrypt_key:
- movaps %xmm2, %xmm5
- .Lkey_expansion_192b_warm:
- shufps \$0b00010000,%xmm0,%xmm4
-- movaps %xmm2,%xmm3
-- pxor %xmm4,%xmm0
-+ movdqa %xmm2,%xmm3
-+ xorps %xmm4,%xmm0
- shufps \$0b10001100,%xmm0,%xmm4
- pslldq \$4,%xmm3
-- pxor %xmm4,%xmm0
-+ xorps %xmm4,%xmm0
- pshufd \$0b01010101,%xmm1,%xmm1 # critical path
- pxor %xmm3,%xmm2
- pxor %xmm1,%xmm0
-@@ -772,11 +1365,11 @@ _aesni_set_encrypt_key:
- lea 16(%rax),%rax
- .Lkey_expansion_256a_cold:
- shufps \$0b00010000,%xmm0,%xmm4
-- pxor %xmm4,%xmm0
-+ xorps %xmm4,%xmm0
- shufps \$0b10001100,%xmm0,%xmm4
-- pxor %xmm4,%xmm0
-- pshufd \$0b11111111,%xmm1,%xmm1 # critical path
-- pxor %xmm1,%xmm0
-+ xorps %xmm4,%xmm0
-+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
-+ xorps %xmm1,%xmm0
- ret
+ &set_label("key_256b",16);
+@@ -695,11 +968,11 @@ if ($PREFIX eq "aesni") {
+ &lea ($key,&DWP(16,$key));
- .align 16
-@@ -785,17 +1378,28 @@ _aesni_set_encrypt_key:
- lea 16(%rax),%rax
+ &shufps ("xmm4","xmm2",0b00010000);
+- &pxor ("xmm2","xmm4");
++ &xorps ("xmm2","xmm4");
+ &shufps ("xmm4","xmm2",0b10001100);
+- &pxor ("xmm2","xmm4");
+- &pshufd ("xmm1","xmm1",0b10101010); # critical path
+- &pxor ("xmm2","xmm1");
++ &xorps ("xmm2","xmm4");
++ &shufps ("xmm1","xmm1",0b10101010); # critical path
++ &xorps ("xmm2","xmm1");
+ &ret();
- shufps \$0b00010000,%xmm2,%xmm4
-- pxor %xmm4,%xmm2
-+ xorps %xmm4,%xmm2
- shufps \$0b10001100,%xmm2,%xmm4
-- pxor %xmm4,%xmm2
-- pshufd \$0b10101010,%xmm1,%xmm1 # critical path
-- pxor %xmm1,%xmm2
-+ xorps %xmm4,%xmm2
-+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
-+ xorps %xmm1,%xmm2
- ret
- .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
-+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
- ___
- }
-
- $code.=<<___;
-+.align 64
-+.Lbswap_mask:
-+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-+.Lincrement32:
-+ .long 6,6,6,0
-+.Lincrement64:
-+ .long 1,0,0,0
-+.Lxts_magic:
-+ .long 0x87,0,1,0
-+
- .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
- .align 64
- ___
-diff -up openssl-1.0.0d/crypto/cryptlib.c.intelopts openssl-1.0.0d/crypto/cryptlib.c
---- openssl-1.0.0d/crypto/cryptlib.c.intelopts 2010-11-19 01:11:27.000000000 +0100
-+++ openssl-1.0.0d/crypto/cryptlib.c 2011-08-24 12:50:55.000000000 +0200
+ &set_label("bad_pointer",4);
+@@ -747,9 +1020,9 @@ if ($PREFIX eq "aesni") {
+ &aesimc ("xmm1","xmm1");
+ &lea ($key,&DWP(16,$key));
+ &lea ("eax",&DWP(-16,"eax"));
+- &cmp ("eax",$key);
+ &$movekey (&QWP(16,"eax"),"xmm0");
+ &$movekey (&QWP(-16,$key),"xmm1");
++ &cmp ("eax",$key);
+ &ja (&label("dec_key_inverse"));
+
+ &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
+diff -up openssl-1.0.0k/crypto/cryptlib.c.intelopts openssl-1.0.0k/crypto/cryptlib.c
+--- openssl-1.0.0k/crypto/cryptlib.c.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/cryptlib.c 2013-02-19 21:15:39.596407392 +0100
@@ -662,22 +662,23 @@ const char *CRYPTO_get_lock_name(int typ
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
@@ -2343,158 +2343,57 @@ diff -up openssl-1.0.0d/crypto/cryptlib.c.intelopts openssl-1.0.0d/crypto/cryptl
/*
* |(1<<10) sets a reserved bit to signal that variable
* was initialized already... This is to avoid interference
-diff -up openssl-1.0.0d/crypto/engine/eng_aesni.c.intelopts openssl-1.0.0d/crypto/engine/eng_aesni.c
---- openssl-1.0.0d/crypto/engine/eng_aesni.c.intelopts 2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/engine/eng_aesni.c 2011-08-24 12:50:55.000000000 +0200
+diff -up openssl-1.0.0k/crypto/engine/eng_aesni.c.intelopts openssl-1.0.0k/crypto/engine/eng_aesni.c
+--- openssl-1.0.0k/crypto/engine/eng_aesni.c.intelopts 2013-02-19 21:15:39.419403774 +0100
++++ openssl-1.0.0k/crypto/engine/eng_aesni.c 2013-02-19 21:15:39.608407632 +0100
@@ -157,16 +157,20 @@ typedef unsigned __int64 IA32CAP;
typedef unsigned long long IA32CAP;
#endif
-+extern IA32CAP OPENSSL_ia32cap_X;
-+
- /* Prepare the ENGINE structure for registration */
- static int
- aesni_bind_helper(ENGINE *e)
- {
- int engage;
-- if (sizeof(OPENSSL_ia32cap_P) > 4) {
-- engage = (OPENSSL_ia32cap_P >> 57) & 1;
-- } else {
-- IA32CAP OPENSSL_ia32_cpuid(void);
-- engage = (OPENSSL_ia32_cpuid() >> 57) & 1;
-+ engage = (OPENSSL_ia32cap_X >> 57) & 1;
-+
-+ /* Disable the AES-NI support if the environment variable
-+ * OPENSSL_DISABLE_AES_NI is set to any value
-+ */
-+ if (getenv("OPENSSL_DISABLE_AES_NI") != NULL) {
-+ engage = 0;
- }
-
- /* Register everything or return with an error */
-diff -up openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c.intelopts openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c
---- openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c.intelopts 2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/fips/fips_standalone_sha1.c 2011-08-24 12:50:55.000000000 +0200
-@@ -62,6 +62,8 @@ void OPENSSL_cleanse(void *p,size_t len)
-
- #ifdef OPENSSL_FIPS
-
-+unsigned long long OPENSSL_ia32cap_X = 0;
-+
- static void hmac_init(SHA256_CTX *md_ctx,SHA256_CTX *o_ctx,
- const char *key)
- {
-diff -up openssl-1.0.0d/crypto/perlasm/x86asm.pl.intelopts openssl-1.0.0d/crypto/perlasm/x86asm.pl
---- openssl-1.0.0d/crypto/perlasm/x86asm.pl.intelopts 2008-12-17 20:56:47.000000000 +0100
-+++ openssl-1.0.0d/crypto/perlasm/x86asm.pl 2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
-
- # require 'x86asm.pl';
- # &asm_init(<flavor>,"des-586.pl"[,$i386only]);
-@@ -80,6 +80,57 @@ sub ::movq
- { &::generic("movq", at _); }
- }
-
-+# SSE>2 instructions
-+my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
-+ "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 );
-+sub ::pextrd
-+{ my($dst,$src,$imm)=@_;
-+ if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
-+ { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
-+ else
-+ { &::generic("pextrd",@_); }
-+}
-+
-+sub ::pinsrd
-+{ my($dst,$src,$imm)=@_;
-+ if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
-+ { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
-+ else
-+ { &::generic("pinsrd",@_); }
-+}
-+
-+sub ::pshufb
-+{ my($dst,$src)=@_;
-+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+ { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); }
-+ else
-+ { &::generic("pshufb",@_); }
-+}
-+
-+sub ::palignr
-+{ my($dst,$src,$imm)=@_;
-+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+ { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
-+ else
-+ { &::generic("palignr",@_); }
-+}
-+
-+sub ::pclmulqdq
-+{ my($dst,$src,$imm)=@_;
-+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-+ { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
-+ else
-+ { &::generic("pclmulqdq",@_); }
-+}
-+
-+sub ::rdrand
-+{ my ($dst)=@_;
-+ if ($dst =~ /(e[a-dsd][ixp])/)
-+ { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); }
-+ else
-+ { &::generic("rdrand",@_); }
-+}
-+
- # label management
- $lbdecor="L"; # local label decoration, set by package
- $label="000";
-diff -up openssl-1.0.0d/crypto/perlasm/x86gas.pl.intelopts openssl-1.0.0d/crypto/perlasm/x86gas.pl
---- openssl-1.0.0d/crypto/perlasm/x86gas.pl.intelopts 2008-12-17 20:56:47.000000000 +0100
-+++ openssl-1.0.0d/crypto/perlasm/x86gas.pl 2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
-
- package x86gas;
-
-@@ -91,6 +91,7 @@ sub ::DWP
- }
- sub ::QWP { &::DWP(@_); }
- sub ::BP { &::DWP(@_); }
-+sub ::WP { &::DWP(@_); }
- sub ::BC { @_; }
- sub ::DWC { @_; }
-
-@@ -161,10 +162,16 @@ sub ::file_end
- { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); }
++extern IA32CAP OPENSSL_ia32cap_X;
++
+ /* Prepare the ENGINE structure for registration */
+ static int
+ aesni_bind_helper(ENGINE *e)
+ {
+ int engage;
+- if (sizeof(OPENSSL_ia32cap_P) > 4) {
+- engage = (OPENSSL_ia32cap_P >> 57) & 1;
+- } else {
+- IA32CAP OPENSSL_ia32_cpuid(void);
+- engage = (OPENSSL_ia32_cpuid() >> 57) & 1;
++ engage = (OPENSSL_ia32cap_X >> 57) & 1;
++
++ /* Disable the AES-NI support if the environment variable
++ * OPENSSL_DISABLE_AES_NI is set to any value
++ */
++ if (getenv("OPENSSL_DISABLE_AES_NI") != NULL) {
++ engage = 0;
}
- }
-+ if (grep {/\b${nmdecor}OPENSSL_ia32cap_X\b/i} @out) {
-+ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_X,8";
-+ if ($::elf) { push (@out,"$tmp,4\n"); }
-+ else { push (@out,"$tmp\n"); }
-+ }
- push(@out,$initseg) if ($initseg);
- }
- sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); }
-+sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
- sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
+ /* Register everything or return with an error */
+diff -up openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.intelopts openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c
+--- openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c.intelopts 2013-02-19 21:15:39.373402833 +0100
++++ openssl-1.0.0k/crypto/fips/fips_standalone_sha1.c 2013-02-19 21:15:39.608407632 +0100
+@@ -62,6 +62,8 @@ void OPENSSL_cleanse(void *p,size_t len)
- sub ::align
-diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl
---- openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts 2010-10-10 23:14:17.000000000 +0200
-+++ openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl 2011-08-24 12:50:56.000000000 +0200
+ #ifdef OPENSSL_FIPS
+
++unsigned long long OPENSSL_ia32cap_X = 0;
++
+ static void hmac_init(SHA256_CTX *md_ctx,SHA256_CTX *o_ctx,
+ const char *key)
+ {
+diff -up openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl
+--- openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/perlasm/x86_64-xlate.pl 2013-02-19 21:15:39.619407858 +0100
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/perl
# Ascetic x86_64 AT&T to MASM/NASM assembler translator by <appro>.
#
-@@ -121,7 +121,11 @@ my %globals;
+@@ -117,7 +117,11 @@ my %globals;
$self->{sz} = "b";
} elsif ($self->{op} =~ /call|jmp/) {
$self->{sz} = "";
@@ -2507,7 +2406,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
$self->{sz} = "";
} elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
$self->{op} = $1;
-@@ -246,35 +250,38 @@ my %globals;
+@@ -242,35 +246,38 @@ my %globals;
$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
$self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
@@ -2558,7 +2457,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
} elsif ($self->{base} eq "rip") {
sprintf "%s[%s]",$szmap{$sz},$self->{label};
} else {
-@@ -506,6 +513,11 @@ my %globals;
+@@ -502,6 +509,11 @@ my %globals;
}
} elsif ($dir =~ /\.(text|data)/) {
$current_segment=".$1";
@@ -2570,7 +2469,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
}
$line = "";
return $self;
-@@ -613,6 +625,19 @@ my %globals;
+@@ -610,6 +622,19 @@ my %globals;
.join(",",@str) if (@str);
last;
};
@@ -2590,7 +2489,7 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
}
$line = "";
}
-@@ -625,9 +650,133 @@ my %globals;
+@@ -622,9 +647,133 @@ my %globals;
}
}
@@ -2693,544 +2592,197 @@ diff -up openssl-1.0.0d/crypto/perlasm/x86_64-xlate.pl.intelopts openssl-1.0.0d/
+my $pclmulqdq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x66);
-+ rex(\@opcode,$3,$2);
-+ push @opcode,0x0f,0x3a,0x44;
-+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
-+ my $c=$1;
-+ push @opcode,$c=~/^0/?oct($c):$c;
-+ @opcode;
-+ } else {
-+ ();
-+ }
-+};
-+
-+my $rdrand = sub {
-+ if (shift =~ /%[er](\w+)/) {
-+ my @opcode=();
-+ my $dst=$1;
-+ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
-+ rex(\@opcode,0,$1,8);
-+ push @opcode,0x0f,0xc7,0xf0|($dst&7);
-+ @opcode;
-+ } else {
-+ ();
-+ }
-+};
-+
- if ($nasm) {
- print <<___;
- default rel
-+%define XMMWORD
- ___
- } elsif ($masm) {
- print <<___;
-@@ -644,14 +793,22 @@ while($line=<>) {
-
- undef $label;
- undef $opcode;
-- undef $sz;
- undef @args;
-
- if ($label=label->re(\$line)) { print $label->out(); }
-
- if (directive->re(\$line)) {
- printf "%s",directive->out();
-- } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
-+ } elsif ($opcode=opcode->re(\$line)) {
-+ my $asm = eval("\$".$opcode->mnemonic());
-+ undef @bytes;
-+
-+ if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
-+ print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
-+ next;
-+ }
-+
-+ ARGUMENT: while (1) {
- my $arg;
-
- if ($arg=register->re(\$line)) { opcode->size($arg->size()); }
-@@ -667,19 +824,26 @@ while($line=<>) {
- $line =~ s/^,\s*//;
- } # ARGUMENT:
-
-- $sz=opcode->size();
--
- if ($#args>=0) {
- my $insn;
-+ my $sz=opcode->size();
-+
- if ($gas) {
- $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
-+ @args = map($_->out($sz),@args);
-+ printf "\t%s\t%s",$insn,join(",",@args);
- } else {
- $insn = $opcode->out();
-- $insn .= $sz if (map($_->out() =~ /x?mm/,@args));
-+ foreach (@args) {
-+ my $arg = $_->out();
-+ # $insn.=$sz compensates for movq, pinsrw, ...
-+ if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
-+ if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
-+ }
- @args = reverse(@args);
- undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
-+ printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
- }
-- printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
- } else {
- printf "\t%s",$opcode->out();
- }
-diff -up openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl.intelopts openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl
---- openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl.intelopts 2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/rc4/asm/rc4-x86_64.pl 2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
- #
- # ====================================================================
- # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -7,6 +7,8 @@
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
-+# July 2004
-+#
- # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
- # "hand-coded assembler"] doesn't stand for the whole improvement
- # coefficient. It turned out that eliminating RC4_CHAR from config
-@@ -19,6 +21,8 @@
- # to operate on partial registers, it turned out to be the best bet.
- # At least for AMD... How IA32E would perform remains to be seen...
-
-+# November 2004
-+#
- # As was shown by Marc Bevand reordering of couple of load operations
- # results in even higher performance gain of 3.3x:-) At least on
- # Opteron... For reference, 1x in this case is RC4_CHAR C-code
-@@ -26,6 +30,8 @@
- # Latter means that if you want to *estimate* what to expect from
- # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
-
-+# November 2004
-+#
- # Intel P4 EM64T core was found to run the AMD64 code really slow...
- # The only way to achieve comparable performance on P4 was to keep
- # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
-@@ -33,10 +39,14 @@
- # on either AMD and Intel platforms, I implement both cases. See
- # rc4_skey.c for further details...
-
-+# April 2005
-+#
- # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
- # those with add/sub results in 50% performance improvement of folded
- # loop...
-
-+# May 2005
-+#
- # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
- # performance by >30% [unlike P4 32-bit case that is]. But this is
- # provided that loads are reordered even more aggressively! Both code
-@@ -50,6 +60,8 @@
- # is not implemented, then this final RC4_CHAR code-path should be
- # preferred, as it provides better *all-round* performance].
-
-+# March 2007
-+#
- # Intel Core2 was observed to perform poorly on both code paths:-( It
- # apparently suffers from some kind of partial register stall, which
- # occurs in 64-bit mode only [as virtually identical 32-bit loop was
-@@ -58,6 +70,34 @@
- # fit for Core2 and therefore the code was modified to skip cloop8 on
- # this CPU.
-
-+# May 2010
-+#
-+# Intel Westmere was observed to perform suboptimally. Adding yet
-+# another movzb to cloop1 improved performance by almost 50%! Core2
-+# performance is improved too, but nominally...
-+
-+# May 2011
-+#
-+# The only code path that was not modified is P4-specific one. Non-P4
-+# Intel code path optimization is heavily based on submission by Maxim
-+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
-+# some of the ideas even in attempt to optmize the original RC4_INT
-+# code path... Current performance in cycles per processed byte (less
-+# is better) and improvement coefficients relative to previous
-+# version of this module are:
-+#
-+# Opteron 5.3/+0%
-+# P4 6.5
-+# Core2 6.2/+15%(*)
-+# Westmere 4.2/+60%
-+# Sandy Bridge 4.2/+120%
-+# Atom 9.3/+80%
-+#
-+# (*) Note that Core2 result is ~15% lower than corresponding result
-+# for 32-bit code, meaning that it's possible to improve it,
-+# but more than likely at the cost of the others (see rc4-586.pl
-+# to get the idea)...
-+
- $flavour = shift;
- $output = shift;
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-@@ -76,13 +116,10 @@ $len="%rsi"; # arg2
- $inp="%rdx"; # arg3
- $out="%rcx"; # arg4
-
--@XX=("%r8","%r10");
--@TX=("%r9","%r11");
--$YY="%r12";
--$TY="%r13";
--
-+{
- $code=<<___;
- .text
-+.extern OPENSSL_ia32cap_P
-
- .globl RC4
- .type RC4,\@function,4
-@@ -95,48 +132,173 @@ RC4: or $len,$len
- push %r12
- push %r13
- .Lprologue:
-+ mov $len,%r11
-+ mov $inp,%r12
-+ mov $out,%r13
-+___
-+my $len="%r11"; # reassign input arguments
-+my $inp="%r12";
-+my $out="%r13";
-+
-+my @XX=("%r10","%rsi");
-+my @TX=("%rax","%rbx");
-+my $YY="%rcx";
-+my $TY="%rdx";
-
-- add \$8,$dat
-- movl -8($dat),$XX[0]#d
-- movl -4($dat),$YY#d
-+$code.=<<___;
-+ xor $XX[0],$XX[0]
-+ xor $YY,$YY
++ rex(\@opcode,$3,$2);
++ push @opcode,0x0f,0x3a,0x44;
++ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
++ my $c=$1;
++ push @opcode,$c=~/^0/?oct($c):$c;
++ @opcode;
++ } else {
++ ();
++ }
++};
+
-+ lea 8($dat),$dat
-+ mov -8($dat),$XX[0]#b
-+ mov -4($dat),$YY#b
- cmpl \$-1,256($dat)
- je .LRC4_CHAR
-+ mov OPENSSL_ia32cap_P(%rip),%r8d
-+ xor $TX[1],$TX[1]
- inc $XX[0]#b
-+ sub $XX[0],$TX[1]
-+ sub $inp,$out
- movl ($dat,$XX[0],4),$TX[0]#d
-- test \$-8,$len
-+ test \$-16,$len
- jz .Lloop1
-- jmp .Lloop8
-+ bt \$30,%r8d # Intel CPU?
-+ jc .Lintel
-+ and \$7,$TX[1]
-+ lea 1($XX[0]),$XX[1]
-+ jz .Loop8
-+ sub $TX[1],$len
-+.Loop8_warmup:
-+ add $TX[0]#b,$YY#b
-+ movl ($dat,$YY,4),$TY#d
-+ movl $TX[0]#d,($dat,$YY,4)
-+ movl $TY#d,($dat,$XX[0],4)
-+ add $TY#b,$TX[0]#b
-+ inc $XX[0]#b
-+ movl ($dat,$TX[0],4),$TY#d
-+ movl ($dat,$XX[0],4),$TX[0]#d
-+ xorb ($inp),$TY#b
-+ movb $TY#b,($out,$inp)
-+ lea 1($inp),$inp
-+ dec $TX[1]
-+ jnz .Loop8_warmup
++my $rdrand = sub {
++ if (shift =~ /%[er](\w+)/) {
++ my @opcode=();
++ my $dst=$1;
++ if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
++ rex(\@opcode,0,$1,8);
++ push @opcode,0x0f,0xc7,0xf0|($dst&7);
++ @opcode;
++ } else {
++ ();
++ }
++};
+
-+ lea 1($XX[0]),$XX[1]
-+ jmp .Loop8
- .align 16
--.Lloop8:
-+.Loop8:
- ___
- for ($i=0;$i<8;$i++) {
-+$code.=<<___ if ($i==7);
-+ add \$8,$XX[1]#b
-+___
- $code.=<<___;
- add $TX[0]#b,$YY#b
-- mov $XX[0],$XX[1]
- movl ($dat,$YY,4),$TY#d
-- ror \$8,%rax # ror is redundant when $i=0
-- inc $XX[1]#b
-- movl ($dat,$XX[1],4),$TX[1]#d
-- cmp $XX[1],$YY
- movl $TX[0]#d,($dat,$YY,4)
-- cmove $TX[0],$TX[1]
-- movl $TY#d,($dat,$XX[0],4)
-+ movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
-+ ror \$8,%r8 # ror is redundant when $i=0
-+ movl $TY#d,4*$i($dat,$XX[0],4)
- add $TX[0]#b,$TY#b
-- movb ($dat,$TY,4),%al
-+ movb ($dat,$TY,4),%r8b
+ if ($nasm) {
+ print <<___;
+ default rel
++%define XMMWORD
___
--push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
-+push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
- }
- $code.=<<___;
-- ror \$8,%rax
-+ add \$8,$XX[0]#b
-+ ror \$8,%r8
- sub \$8,$len
+ } elsif ($masm) {
+ print <<___;
+@@ -641,14 +790,22 @@ while($line=<>) {
-- xor ($inp),%rax
-- add \$8,$inp
-- mov %rax,($out)
-- add \$8,$out
-+ xor ($inp),%r8
-+ mov %r8,($out,$inp)
-+ lea 8($inp),$inp
+ undef $label;
+ undef $opcode;
+- undef $sz;
+ undef @args;
- test \$-8,$len
-- jnz .Lloop8
-+ jnz .Loop8
-+ cmp \$0,$len
-+ jne .Lloop1
-+ jmp .Lexit
+ if ($label=label->re(\$line)) { print $label->out(); }
+
+ if (directive->re(\$line)) {
+ printf "%s",directive->out();
+- } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
++ } elsif ($opcode=opcode->re(\$line)) {
++ my $asm = eval("\$".$opcode->mnemonic());
++ undef @bytes;
++
++ if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
++ print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
++ next;
++ }
+
-+.align 16
-+.Lintel:
-+ test \$-32,$len
-+ jz .Lloop1
-+ and \$15,$TX[1]
-+ jz .Loop16_is_hot
-+ sub $TX[1],$len
-+.Loop16_warmup:
-+ add $TX[0]#b,$YY#b
-+ movl ($dat,$YY,4),$TY#d
-+ movl $TX[0]#d,($dat,$YY,4)
-+ movl $TY#d,($dat,$XX[0],4)
-+ add $TY#b,$TX[0]#b
-+ inc $XX[0]#b
-+ movl ($dat,$TX[0],4),$TY#d
-+ movl ($dat,$XX[0],4),$TX[0]#d
-+ xorb ($inp),$TY#b
-+ movb $TY#b,($out,$inp)
-+ lea 1($inp),$inp
-+ dec $TX[1]
-+ jnz .Loop16_warmup
++ ARGUMENT: while (1) {
+ my $arg;
+
+ if ($arg=register->re(\$line)) { opcode->size($arg->size()); }
+@@ -664,19 +821,26 @@ while($line=<>) {
+ $line =~ s/^,\s*//;
+ } # ARGUMENT:
+
+- $sz=opcode->size();
+-
+ if ($#args>=0) {
+ my $insn;
++ my $sz=opcode->size();
+
-+ mov $YY,$TX[1]
-+ xor $YY,$YY
-+ mov $TX[1]#b,$YY#b
+ if ($gas) {
+ $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
++ @args = map($_->out($sz),@args);
++ printf "\t%s\t%s",$insn,join(",",@args);
+ } else {
+ $insn = $opcode->out();
+- $insn .= $sz if (map($_->out() =~ /x?mm/,@args));
++ foreach (@args) {
++ my $arg = $_->out();
++ # $insn.=$sz compensates for movq, pinsrw, ...
++ if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
++ if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
++ }
+ @args = reverse(@args);
+ undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
++ printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
+ }
+- printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
+ } else {
+ printf "\t%s",$opcode->out();
+ }
+diff -up openssl-1.0.0k/crypto/perlasm/x86asm.pl.intelopts openssl-1.0.0k/crypto/perlasm/x86asm.pl
+--- openssl-1.0.0k/crypto/perlasm/x86asm.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/perlasm/x86asm.pl 2013-02-19 21:15:39.611407695 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+
+ # require 'x86asm.pl';
+ # &asm_init(<flavor>,"des-586.pl"[,$i386only]);
+@@ -80,6 +80,57 @@ sub ::movq
+ { &::generic("movq",@_); }
+ }
+
++# SSE>2 instructions
++my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
++ "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 );
++sub ::pextrd
++{ my($dst,$src,$imm)=@_;
++ if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
++ { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); }
++ else
++ { &::generic("pextrd",@_); }
++}
+
-+.Loop16_is_hot:
-+ lea ($dat,$XX[0],4),$XX[1]
-+___
-+sub RC4_loop {
-+ my $i=shift;
-+ my $j=$i<0?0:$i;
-+ my $xmm="%xmm".($j&1);
++sub ::pinsrd
++{ my($dst,$src,$imm)=@_;
++ if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
++ { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); }
++ else
++ { &::generic("pinsrd",@_); }
++}
+
-+ $code.=" add \$16,$XX[0]#b\n" if ($i==15);
-+ $code.=" movdqu ($inp),%xmm2\n" if ($i==15);
-+ $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
-+ $code.=" movl ($dat,$YY,4),$TY#d\n";
-+ $code.=" pxor %xmm0,%xmm2\n" if ($i==0);
-+ $code.=" psllq \$8,%xmm1\n" if ($i==0);
-+ $code.=" pxor $xmm,$xmm\n" if ($i<=1);
-+ $code.=" movl $TX[0]#d,($dat,$YY,4)\n";
-+ $code.=" add $TY#b,$TX[0]#b\n";
-+ $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
-+ $code.=" movz $TX[0]#b,$TX[0]#d\n";
-+ $code.=" movl $TY#d,`4*$j`($XX[1])\n";
-+ $code.=" pxor %xmm1,%xmm2\n" if ($i==0);
-+ $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
-+ $code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
-+ $code.=" pinsrw \$`$j>>1`,($dat,$TX[0],4),$xmm\n";
-+ $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
-+ $code.=" lea 16($inp),$inp\n" if ($i==0);
-+ $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
++sub ::pshufb
++{ my($dst,$src)=@_;
++ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++ { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); }
++ else
++ { &::generic("pshufb",@_); }
+}
-+ RC4_loop(-1);
-+$code.=<<___;
-+ jmp .Loop16_enter
-+.align 16
-+.Loop16:
-+___
+
-+for ($i=0;$i<16;$i++) {
-+ $code.=".Loop16_enter:\n" if ($i==1);
-+ RC4_loop($i);
-+ push(@TX,shift(@TX)); # "rotate" registers
++sub ::palignr
++{ my($dst,$src,$imm)=@_;
++ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++ { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); }
++ else
++ { &::generic("palignr",@_); }
+}
-+$code.=<<___;
-+ mov $YY,$TX[1]
-+ xor $YY,$YY # keyword to partial register
-+ sub \$16,$len
-+ mov $TX[1]#b,$YY#b
-+ test \$-16,$len
-+ jnz .Loop16
+
-+ psllq \$8,%xmm1
-+ pxor %xmm0,%xmm2
-+ pxor %xmm1,%xmm2
-+ movdqu %xmm2,($out,$inp)
-+ lea 16($inp),$inp
++sub ::pclmulqdq
++{ my($dst,$src,$imm)=@_;
++ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
++ { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); }
++ else
++ { &::generic("pclmulqdq",@_); }
++}
+
- cmp \$0,$len
- jne .Lloop1
- jmp .Lexit
-@@ -152,9 +314,8 @@ $code.=<<___;
- movl ($dat,$TX[0],4),$TY#d
- movl ($dat,$XX[0],4),$TX[0]#d
- xorb ($inp),$TY#b
-- inc $inp
-- movb $TY#b,($out)
-- inc $out
-+ movb $TY#b,($out,$inp)
-+ lea 1($inp),$inp
- dec $len
- jnz .Lloop1
- jmp .Lexit
-@@ -165,13 +326,11 @@ $code.=<<___;
- movzb ($dat,$XX[0]),$TX[0]#d
- test \$-8,$len
- jz .Lcloop1
-- cmpl \$0,260($dat)
-- jnz .Lcloop1
- jmp .Lcloop8
- .align 16
- .Lcloop8:
-- mov ($inp),%eax
-- mov 4($inp),%ebx
-+ mov ($inp),%r8d
-+ mov 4($inp),%r9d
- ___
- # unroll 2x4-wise, because 64-bit rotates kill Intel P4...
- for ($i=0;$i<4;$i++) {
-@@ -188,8 +347,8 @@ $code.=<<___;
- mov $TX[0],$TX[1]
- .Lcmov$i:
- add $TX[0]#b,$TY#b
-- xor ($dat,$TY),%al
-- ror \$8,%eax
-+ xor ($dat,$TY),%r8b
-+ ror \$8,%r8d
- ___
- push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
- }
-@@ -207,16 +366,16 @@ $code.=<<___;
- mov $TX[0],$TX[1]
- .Lcmov$i:
- add $TX[0]#b,$TY#b
-- xor ($dat,$TY),%bl
-- ror \$8,%ebx
-+ xor ($dat,$TY),%r9b
-+ ror \$8,%r9d
- ___
- push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
- }
- $code.=<<___;
- lea -8($len),$len
-- mov %eax,($out)
-+ mov %r8d,($out)
- lea 8($inp),$inp
-- mov %ebx,4($out)
-+ mov %r9d,4($out)
- lea 8($out),$out
-
- test \$-8,$len
-@@ -229,6 +388,7 @@ $code.=<<___;
- .align 16
- .Lcloop1:
- add $TX[0]#b,$YY#b
-+ movzb $YY#b,$YY#d
- movzb ($dat,$YY),$TY#d
- movb $TX[0]#b,($dat,$YY)
- movb $TY#b,($dat,$XX[0])
-@@ -260,12 +420,12 @@ $code.=<<___;
- ret
- .size RC4,.-RC4
- ___
++sub ::rdrand
++{ my ($dst)=@_;
++ if ($dst =~ /(e[a-dsd][ixp])/)
++ { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); }
++ else
++ { &::generic("rdrand",@_); }
+}
++
+ # label management
+ $lbdecor="L"; # local label decoration, set by package
+ $label="000";
+diff -up openssl-1.0.0k/crypto/perlasm/x86gas.pl.intelopts openssl-1.0.0k/crypto/perlasm/x86gas.pl
+--- openssl-1.0.0k/crypto/perlasm/x86gas.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/perlasm/x86gas.pl 2013-02-19 21:15:39.617407816 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
- $idx="%r8";
- $ido="%r9";
-
- $code.=<<___;
--.extern OPENSSL_ia32cap_P
- .globl RC4_set_key
- .type RC4_set_key,\@function,3
- .align 16
-@@ -280,12 +440,9 @@ RC4_set_key:
- xor %r11,%r11
-
- mov OPENSSL_ia32cap_P(%rip),$idx#d
-- bt \$20,$idx#d
-- jnc .Lw1stloop
-- bt \$30,$idx#d
-- setc $ido#b
-- mov $ido#d,260($dat)
-- jmp .Lc1stloop
-+ bt \$20,$idx#d # RC4_CHAR?
-+ jc .Lc1stloop
-+ jmp .Lw1stloop
+ package x86gas;
- .align 16
- .Lw1stloop:
-@@ -348,18 +505,20 @@ RC4_options:
- lea .Lopts(%rip),%rax
- mov OPENSSL_ia32cap_P(%rip),%edx
- bt \$20,%edx
-- jnc .Ldone
-- add \$12,%rax
-+ jc .L8xchar
- bt \$30,%edx
- jnc .Ldone
-- add \$13,%rax
-+ add \$25,%rax
-+ ret
-+.L8xchar:
-+ add \$12,%rax
- .Ldone:
- ret
- .align 64
- .Lopts:
- .asciz "rc4(8x,int)"
- .asciz "rc4(8x,char)"
--.asciz "rc4(1x,char)"
-+.asciz "rc4(16x,int)"
- .asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
- .align 64
- .size RC4_options,.-RC4_options
-@@ -497,8 +656,17 @@ key_se_handler:
- ___
+@@ -91,6 +91,7 @@ sub ::DWP
}
+ sub ::QWP { &::DWP(@_); }
+ sub ::BP { &::DWP(@_); }
++sub ::WP { &::DWP(@_); }
+ sub ::BC { @_; }
+ sub ::DWC { @_; }
--$code =~ s/#([bwd])/$1/gm;
-+sub reg_part {
-+my ($reg,$conv)=@_;
-+ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
-+ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
-+ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
-+ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
-+ return $reg;
-+}
+@@ -161,10 +162,16 @@ sub ::file_end
+ { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); }
+ }
+ }
++ if (grep {/\b${nmdecor}OPENSSL_ia32cap_X\b/i} @out) {
++ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_X,8";
++ if ($::elf) { push (@out,"$tmp,4\n"); }
++ else { push (@out,"$tmp\n"); }
++ }
+ push(@out,$initseg) if ($initseg);
+ }
-+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
-+$code =~ s/\`([^\`]*)\`/eval $1/gem;
- $code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
+ sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); }
++sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
+ sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
- print $code;
-diff -up openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl.intelopts openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl
---- openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl.intelopts 2011-08-24 12:50:55.000000000 +0200
-+++ openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl 2011-08-24 12:50:56.000000000 +0200
+ sub ::align
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.intelopts openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl.intelopts 2013-02-19 21:15:39.360402569 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-586.pl 2013-02-19 21:15:39.623407939 +0100
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/perl
@@ -3447,1906 +2999,2219 @@ diff -up openssl-1.0.0d/crypto/rc4/asm/rc4-586.pl.intelopts openssl-1.0.0d/crypt
&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align (64);
&function_end_B("RC4_options");
-diff -up openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl.intelopts openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl
---- openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl.intelopts 2010-01-17 17:58:56.000000000 +0100
-+++ openssl-1.0.0d/crypto/sha/asm/sha1-x86_64.pl 2011-08-24 12:50:56.000000000 +0200
+diff -up openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.intelopts openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl
+--- openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl.intelopts 2013-02-19 21:15:39.360402569 +0100
++++ openssl-1.0.0k/crypto/rc4/asm/rc4-x86_64.pl 2013-02-19 21:15:39.621407898 +0100
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/perl
#
# ====================================================================
# Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -16,7 +16,7 @@
- # There was suggestion to mechanically translate 32-bit code, but I
- # dismissed it, reasoning that x86_64 offers enough register bank
- # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
--# implementation:-) However! While 64-bit code does performs better
-+# implementation:-) However! While 64-bit code does perform better
- # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
- # x86_64 does offer larger *addressable* bank, but out-of-order core
- # reaches for even more registers through dynamic aliasing, and EM64T
-@@ -29,6 +29,38 @@
- # Xeon P4 +65% +0% 9.9
- # Core2 +60% +10% 7.0
+@@ -7,6 +7,8 @@
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
++# July 2004
++#
+ # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+ # "hand-coded assembler"] doesn't stand for the whole improvement
+ # coefficient. It turned out that eliminating RC4_CHAR from config
+@@ -19,6 +21,8 @@
+ # to operate on partial registers, it turned out to be the best bet.
+ # At least for AMD... How IA32E would perform remains to be seen...
+
++# November 2004
++#
+ # As was shown by Marc Bevand reordering of couple of load operations
+ # results in even higher performance gain of 3.3x:-) At least on
+ # Opteron... For reference, 1x in this case is RC4_CHAR C-code
+@@ -26,6 +30,8 @@
+ # Latter means that if you want to *estimate* what to expect from
+ # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+
++# November 2004
++#
+ # Intel P4 EM64T core was found to run the AMD64 code really slow...
+ # The only way to achieve comparable performance on P4 was to keep
+ # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+@@ -33,10 +39,14 @@
+ # on either AMD and Intel platforms, I implement both cases. See
+ # rc4_skey.c for further details...
+
++# April 2005
++#
+ # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
+ # those with add/sub results in 50% performance improvement of folded
+ # loop...
+
++# May 2005
++#
+ # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
+ # performance by >30% [unlike P4 32-bit case that is]. But this is
+ # provided that loads are reordered even more aggressively! Both code
+@@ -50,6 +60,8 @@
+ # is not implemented, then this final RC4_CHAR code-path should be
+ # preferred, as it provides better *all-round* performance].
+
++# March 2007
++#
+ # Intel Core2 was observed to perform poorly on both code paths:-( It
+ # apparently suffers from some kind of partial register stall, which
+ # occurs in 64-bit mode only [as virtually identical 32-bit loop was
+@@ -58,6 +70,34 @@
+ # fit for Core2 and therefore the code was modified to skip cloop8 on
+ # this CPU.
-+# August 2009.
++# May 2010
+#
-+# The code was revised to minimize code size and to maximize
-+# "distance" between instructions producing input to 'lea'
-+# instruction and the 'lea' instruction itself, which is essential
-+# for Intel Atom core.
++# Intel Westmere was observed to perform suboptimally. Adding yet
++# another movzb to cloop1 improved performance by almost 50%! Core2
++# performance is improved too, but nominally...
+
-+# October 2010.
++# May 2011
+#
-+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
-+# is to offload message schedule denoted by Wt in NIST specification,
-+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
-+# for background and implementation details. The only difference from
-+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
-+# to free temporary registers.
-+
-+# April 2011.
++# The only code path that was not modified is P4-specific one. Non-P4
++# Intel code path optimization is heavily based on submission by Maxim
++# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
++# some of the ideas even in attempt to optmize the original RC4_INT
++# code path... Current performance in cycles per processed byte (less
++# is better) and improvement coefficients relative to previous
++# version of this module are:
+#
-+# Add AVX code path. See sha1-586.pl for further information.
-+
-+######################################################################
-+# Current performance is summarized in following table. Numbers are
-+# CPU clock cycles spent to process single byte (less is better).
++# Opteron 5.3/+0%
++# P4 6.5
++# Core2 6.2/+15%(*)
++# Westmere 4.2/+60%
++# Sandy Bridge 4.2/+120%
++# Atom 9.3/+80%
+#
-+# x86_64 SSSE3 AVX
-+# P4 9.8 -
-+# Opteron 6.6 -
-+# Core2 6.7 6.1/+10% -
-+# Atom 11.0 9.7/+13% -
-+# Westmere 7.1 5.6/+27% -
-+# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
++# (*) Note that Core2 result is ~15% lower than corresponding result
++# for 32-bit code, meaning that it's possible to improve it,
++# but more than likely at the cost of the others (see rc4-586.pl
++# to get the idea)...
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-@@ -40,6 +72,13 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
+@@ -76,13 +116,10 @@ $len="%rsi"; # arg2
+ $inp="%rdx"; # arg3
+ $out="%rcx"; # arg4
-+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
-+ $1>=2.19);
-+$avx=1 if (!$avx && $flavour =~ /nasm/ &&
-+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
-+ $1>=2.03);
+-@XX=("%r8","%r10");
+-@TX=("%r9","%r11");
+-$YY="%r12";
+-$TY="%r13";
+-
++{
+ $code=<<___;
+ .text
++.extern OPENSSL_ia32cap_P
+
+ .globl RC4
+ .type RC4,\@function,4
+@@ -95,48 +132,173 @@ RC4: or $len,$len
+ push %r12
+ push %r13
+ .Lprologue:
++ mov $len,%r11
++ mov $inp,%r12
++ mov $out,%r13
++___
++my $len="%r11"; # reassign input arguments
++my $inp="%r12";
++my $out="%r13";
++
++my @XX=("%r10","%rsi");
++my @TX=("%rax","%rbx");
++my $YY="%rcx";
++my $TY="%rdx";
+
+- add \$8,$dat
+- movl -8($dat),$XX[0]#d
+- movl -4($dat),$YY#d
++$code.=<<___;
++ xor $XX[0],$XX[0]
++ xor $YY,$YY
++
++ lea 8($dat),$dat
++ mov -8($dat),$XX[0]#b
++ mov -4($dat),$YY#b
+ cmpl \$-1,256($dat)
+ je .LRC4_CHAR
++ mov OPENSSL_ia32cap_P(%rip),%r8d
++ xor $TX[1],$TX[1]
+ inc $XX[0]#b
++ sub $XX[0],$TX[1]
++ sub $inp,$out
+ movl ($dat,$XX[0],4),$TX[0]#d
+- test \$-8,$len
++ test \$-16,$len
+ jz .Lloop1
+- jmp .Lloop8
++ bt \$30,%r8d # Intel CPU?
++ jc .Lintel
++ and \$7,$TX[1]
++ lea 1($XX[0]),$XX[1]
++ jz .Loop8
++ sub $TX[1],$len
++.Loop8_warmup:
++ add $TX[0]#b,$YY#b
++ movl ($dat,$YY,4),$TY#d
++ movl $TX[0]#d,($dat,$YY,4)
++ movl $TY#d,($dat,$XX[0],4)
++ add $TY#b,$TX[0]#b
++ inc $XX[0]#b
++ movl ($dat,$TX[0],4),$TY#d
++ movl ($dat,$XX[0],4),$TX[0]#d
++ xorb ($inp),$TY#b
++ movb $TY#b,($out,$inp)
++ lea 1($inp),$inp
++ dec $TX[1]
++ jnz .Loop8_warmup
++
++ lea 1($XX[0]),$XX[1]
++ jmp .Loop8
+ .align 16
+-.Lloop8:
++.Loop8:
+ ___
+ for ($i=0;$i<8;$i++) {
++$code.=<<___ if ($i==7);
++ add \$8,$XX[1]#b
++___
+ $code.=<<___;
+ add $TX[0]#b,$YY#b
+- mov $XX[0],$XX[1]
+ movl ($dat,$YY,4),$TY#d
+- ror \$8,%rax # ror is redundant when $i=0
+- inc $XX[1]#b
+- movl ($dat,$XX[1],4),$TX[1]#d
+- cmp $XX[1],$YY
+ movl $TX[0]#d,($dat,$YY,4)
+- cmove $TX[0],$TX[1]
+- movl $TY#d,($dat,$XX[0],4)
++ movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
++ ror \$8,%r8 # ror is redundant when $i=0
++ movl $TY#d,4*$i($dat,$XX[0],4)
+ add $TX[0]#b,$TY#b
+- movb ($dat,$TY,4),%al
++ movb ($dat,$TY,4),%r8b
+ ___
+-push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
++push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
+ }
+ $code.=<<___;
+- ror \$8,%rax
++ add \$8,$XX[0]#b
++ ror \$8,%r8
+ sub \$8,$len
+
+- xor ($inp),%rax
+- add \$8,$inp
+- mov %rax,($out)
+- add \$8,$out
++ xor ($inp),%r8
++ mov %r8,($out,$inp)
++ lea 8($inp),$inp
+
+ test \$-8,$len
+- jnz .Lloop8
++ jnz .Loop8
++ cmp \$0,$len
++ jne .Lloop1
++ jmp .Lexit
++
++.align 16
++.Lintel:
++ test \$-32,$len
++ jz .Lloop1
++ and \$15,$TX[1]
++ jz .Loop16_is_hot
++ sub $TX[1],$len
++.Loop16_warmup:
++ add $TX[0]#b,$YY#b
++ movl ($dat,$YY,4),$TY#d
++ movl $TX[0]#d,($dat,$YY,4)
++ movl $TY#d,($dat,$XX[0],4)
++ add $TY#b,$TX[0]#b
++ inc $XX[0]#b
++ movl ($dat,$TX[0],4),$TY#d
++ movl ($dat,$XX[0],4),$TX[0]#d
++ xorb ($inp),$TY#b
++ movb $TY#b,($out,$inp)
++ lea 1($inp),$inp
++ dec $TX[1]
++ jnz .Loop16_warmup
++
++ mov $YY,$TX[1]
++ xor $YY,$YY
++ mov $TX[1]#b,$YY#b
++
++.Loop16_is_hot:
++ lea ($dat,$XX[0],4),$XX[1]
++___
++sub RC4_loop {
++ my $i=shift;
++ my $j=$i<0?0:$i;
++ my $xmm="%xmm".($j&1);
++
++ $code.=" add \$16,$XX[0]#b\n" if ($i==15);
++ $code.=" movdqu ($inp),%xmm2\n" if ($i==15);
++ $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
++ $code.=" movl ($dat,$YY,4),$TY#d\n";
++ $code.=" pxor %xmm0,%xmm2\n" if ($i==0);
++ $code.=" psllq \$8,%xmm1\n" if ($i==0);
++ $code.=" pxor $xmm,$xmm\n" if ($i<=1);
++ $code.=" movl $TX[0]#d,($dat,$YY,4)\n";
++ $code.=" add $TY#b,$TX[0]#b\n";
++ $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
++ $code.=" movz $TX[0]#b,$TX[0]#d\n";
++ $code.=" movl $TY#d,`4*$j`($XX[1])\n";
++ $code.=" pxor %xmm1,%xmm2\n" if ($i==0);
++ $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
++ $code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
++ $code.=" pinsrw \$`$j>>1`,($dat,$TX[0],4),$xmm\n";
++ $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
++ $code.=" lea 16($inp),$inp\n" if ($i==0);
++ $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
++}
++ RC4_loop(-1);
++$code.=<<___;
++ jmp .Loop16_enter
++.align 16
++.Loop16:
++___
++
++for ($i=0;$i<16;$i++) {
++ $code.=".Loop16_enter:\n" if ($i==1);
++ RC4_loop($i);
++ push(@TX,shift(@TX)); # "rotate" registers
++}
++$code.=<<___;
++ mov $YY,$TX[1]
++ xor $YY,$YY # keyword to partial register
++ sub \$16,$len
++ mov $TX[1]#b,$YY#b
++ test \$-16,$len
++ jnz .Loop16
+
- open STDOUT,"| $^X $xlate $flavour $output";
-
- $ctx="%rdi"; # 1st arg
-@@ -51,196 +90,994 @@ $ctx="%r8";
- $inp="%r9";
- $num="%r10";
-
--$xi="%eax";
--$t0="%ebx";
--$t1="%ecx";
--$A="%edx";
--$B="%esi";
--$C="%edi";
--$D="%ebp";
--$E="%r11d";
--$T="%r12d";
--
--@V=($A,$B,$C,$D,$E,$T);
-+$t0="%eax";
-+$t1="%ebx";
-+$t2="%ecx";
-+@xi=("%edx","%ebp");
-+$A="%esi";
-+$B="%edi";
-+$C="%r11d";
-+$D="%r12d";
-+$E="%r13d";
-
--sub PROLOGUE {
--my $func=shift;
--$code.=<<___;
--.globl $func
--.type $func,\@function,3
--.align 16
--$func:
-- push %rbx
-- push %rbp
-- push %r12
-- mov %rsp,%r11
-- mov %rdi,$ctx # reassigned argument
-- sub \$`8+16*4`,%rsp
-- mov %rsi,$inp # reassigned argument
-- and \$-64,%rsp
-- mov %rdx,$num # reassigned argument
-- mov %r11,`16*4`(%rsp)
--.Lprologue:
--
-- mov 0($ctx),$A
-- mov 4($ctx),$B
-- mov 8($ctx),$C
-- mov 12($ctx),$D
-- mov 16($ctx),$E
--___
--}
--
--sub EPILOGUE {
--my $func=shift;
--$code.=<<___;
-- mov `16*4`(%rsp),%rsi
-- mov (%rsi),%r12
-- mov 8(%rsi),%rbp
-- mov 16(%rsi),%rbx
-- lea 24(%rsi),%rsp
--.Lepilogue:
-- ret
--.size $func,.-$func
--___
--}
-+@V=($A,$B,$C,$D,$E);
-
- sub BODY_00_19 {
--my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
-+my ($i,$a,$b,$c,$d,$e)=@_;
- my $j=$i+1;
- $code.=<<___ if ($i==0);
-- mov `4*$i`($inp),$xi
-- `"bswap $xi" if(!defined($host))`
-- mov $xi,`4*$i`(%rsp)
-+ mov `4*$i`($inp),$xi[0]
-+ bswap $xi[0]
-+ mov $xi[0],`4*$i`(%rsp)
++ psllq \$8,%xmm1
++ pxor %xmm0,%xmm2
++ pxor %xmm1,%xmm2
++ movdqu %xmm2,($out,$inp)
++ lea 16($inp),$inp
++
+ cmp \$0,$len
+ jne .Lloop1
+ jmp .Lexit
+@@ -152,9 +314,8 @@ $code.=<<___;
+ movl ($dat,$TX[0],4),$TY#d
+ movl ($dat,$XX[0],4),$TX[0]#d
+ xorb ($inp),$TY#b
+- inc $inp
+- movb $TY#b,($out)
+- inc $out
++ movb $TY#b,($out,$inp)
++ lea 1($inp),$inp
+ dec $len
+ jnz .Lloop1
+ jmp .Lexit
+@@ -165,13 +326,11 @@ $code.=<<___;
+ movzb ($dat,$XX[0]),$TX[0]#d
+ test \$-8,$len
+ jz .Lcloop1
+- cmpl \$0,260($dat)
+- jnz .Lcloop1
+ jmp .Lcloop8
+ .align 16
+ .Lcloop8:
+- mov ($inp),%eax
+- mov 4($inp),%ebx
++ mov ($inp),%r8d
++ mov 4($inp),%r9d
___
- $code.=<<___ if ($i<15);
-- lea 0x5a827999($xi,$e),$f
- mov $c,$t0
-- mov `4*$j`($inp),$xi
-- mov $a,$e
-+ mov `4*$j`($inp),$xi[1]
-+ mov $a,$t2
- xor $d,$t0
-- `"bswap $xi" if(!defined($host))`
-- rol \$5,$e
-+ bswap $xi[1]
-+ rol \$5,$t2
-+ lea 0x5a827999($xi[0],$e),$e
- and $b,$t0
-- mov $xi,`4*$j`(%rsp)
-- add $e,$f
-+ mov $xi[1],`4*$j`(%rsp)
-+ add $t2,$e
- xor $d,$t0
- rol \$30,$b
-- add $t0,$f
-+ add $t0,$e
+ # unroll 2x4-wise, because 64-bit rotates kill Intel P4...
+ for ($i=0;$i<4;$i++) {
+@@ -188,8 +347,8 @@ $code.=<<___;
+ mov $TX[0],$TX[1]
+ .Lcmov$i:
+ add $TX[0]#b,$TY#b
+- xor ($dat,$TY),%al
+- ror \$8,%eax
++ xor ($dat,$TY),%r8b
++ ror \$8,%r8d
___
- $code.=<<___ if ($i>=15);
-- lea 0x5a827999($xi,$e),$f
-- mov `4*($j%16)`(%rsp),$xi
-+ mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
-- mov $a,$e
-- xor `4*(($j+2)%16)`(%rsp),$xi
-+ mov $a,$t2
-+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $d,$t0
-- rol \$5,$e
-- xor `4*(($j+8)%16)`(%rsp),$xi
-+ rol \$5,$t2
-+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
- and $b,$t0
-- add $e,$f
-- xor `4*(($j+13)%16)`(%rsp),$xi
-+ lea 0x5a827999($xi[0],$e),$e
-+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
- xor $d,$t0
-+ rol \$1,$xi[1]
-+ add $t2,$e
- rol \$30,$b
-- add $t0,$f
-- rol \$1,$xi
-- mov $xi,`4*($j%16)`(%rsp)
-+ mov $xi[1],`4*($j%16)`(%rsp)
-+ add $t0,$e
+ push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
+ }
+@@ -207,16 +366,16 @@ $code.=<<___;
+ mov $TX[0],$TX[1]
+ .Lcmov$i:
+ add $TX[0]#b,$TY#b
+- xor ($dat,$TY),%bl
+- ror \$8,%ebx
++ xor ($dat,$TY),%r9b
++ ror \$8,%r9d
___
-+unshift(@xi,pop(@xi));
+ push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
}
+ $code.=<<___;
+ lea -8($len),$len
+- mov %eax,($out)
++ mov %r8d,($out)
+ lea 8($inp),$inp
+- mov %ebx,4($out)
++ mov %r9d,4($out)
+ lea 8($out),$out
- sub BODY_20_39 {
--my ($i,$a,$b,$c,$d,$e,$f)=@_;
-+my ($i,$a,$b,$c,$d,$e)=@_;
- my $j=$i+1;
- my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
- $code.=<<___ if ($i<79);
-- lea $K($xi,$e),$f
-- mov `4*($j%16)`(%rsp),$xi
-+ mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
-- mov $a,$e
-- xor `4*(($j+2)%16)`(%rsp),$xi
-+ mov $a,$t2
-+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $b,$t0
-- rol \$5,$e
-- xor `4*(($j+8)%16)`(%rsp),$xi
-+ rol \$5,$t2
-+ lea $K($xi[0],$e),$e
-+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t0
-- add $e,$f
-- xor `4*(($j+13)%16)`(%rsp),$xi
-+ add $t2,$e
-+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
- rol \$30,$b
-- add $t0,$f
-- rol \$1,$xi
-+ add $t0,$e
-+ rol \$1,$xi[1]
- ___
- $code.=<<___ if ($i<76);
-- mov $xi,`4*($j%16)`(%rsp)
-+ mov $xi[1],`4*($j%16)`(%rsp)
- ___
- $code.=<<___ if ($i==79);
-- lea $K($xi,$e),$f
- mov $c,$t0
-- mov $a,$e
-+ mov $a,$t2
- xor $b,$t0
-- rol \$5,$e
-+ lea $K($xi[0],$e),$e
-+ rol \$5,$t2
- xor $d,$t0
-- add $e,$f
-+ add $t2,$e
- rol \$30,$b
-- add $t0,$f
-+ add $t0,$e
+ test \$-8,$len
+@@ -229,6 +388,7 @@ $code.=<<___;
+ .align 16
+ .Lcloop1:
+ add $TX[0]#b,$YY#b
++ movzb $YY#b,$YY#d
+ movzb ($dat,$YY),$TY#d
+ movb $TX[0]#b,($dat,$YY)
+ movb $TY#b,($dat,$XX[0])
+@@ -260,12 +420,12 @@ $code.=<<___;
+ ret
+ .size RC4,.-RC4
___
-+unshift(@xi,pop(@xi));
- }
++}
+
+ $idx="%r8";
+ $ido="%r9";
- sub BODY_40_59 {
--my ($i,$a,$b,$c,$d,$e,$f)=@_;
-+my ($i,$a,$b,$c,$d,$e)=@_;
- my $j=$i+1;
$code.=<<___;
-- lea 0x8f1bbcdc($xi,$e),$f
-- mov `4*($j%16)`(%rsp),$xi
-- mov $b,$t0
-- mov $b,$t1
-- xor `4*(($j+2)%16)`(%rsp),$xi
-- mov $a,$e
-- and $c,$t0
-- xor `4*(($j+8)%16)`(%rsp),$xi
-- or $c,$t1
-- rol \$5,$e
-- xor `4*(($j+13)%16)`(%rsp),$xi
-- and $d,$t1
-- add $e,$f
-- rol \$1,$xi
-- or $t1,$t0
-+ mov `4*($j%16)`(%rsp),$xi[1]
-+ mov $c,$t0
-+ mov $c,$t1
-+ xor `4*(($j+2)%16)`(%rsp),$xi[1]
-+ and $d,$t0
-+ mov $a,$t2
-+ xor `4*(($j+8)%16)`(%rsp),$xi[1]
-+ xor $d,$t1
-+ lea 0x8f1bbcdc($xi[0],$e),$e
-+ rol \$5,$t2
-+ xor `4*(($j+13)%16)`(%rsp),$xi[1]
-+ add $t0,$e
-+ and $b,$t1
-+ rol \$1,$xi[1]
-+ add $t1,$e
- rol \$30,$b
-- mov $xi,`4*($j%16)`(%rsp)
-- add $t0,$f
-+ mov $xi[1],`4*($j%16)`(%rsp)
-+ add $t2,$e
+-.extern OPENSSL_ia32cap_P
+ .globl RC4_set_key
+ .type RC4_set_key,\@function,3
+ .align 16
+@@ -280,12 +440,9 @@ RC4_set_key:
+ xor %r11,%r11
+
+ mov OPENSSL_ia32cap_P(%rip),$idx#d
+- bt \$20,$idx#d
+- jnc .Lw1stloop
+- bt \$30,$idx#d
+- setc $ido#b
+- mov $ido#d,260($dat)
+- jmp .Lc1stloop
++ bt \$20,$idx#d # RC4_CHAR?
++ jc .Lc1stloop
++ jmp .Lw1stloop
+
+ .align 16
+ .Lw1stloop:
+@@ -348,18 +505,20 @@ RC4_options:
+ lea .Lopts(%rip),%rax
+ mov OPENSSL_ia32cap_P(%rip),%edx
+ bt \$20,%edx
+- jnc .Ldone
+- add \$12,%rax
++ jc .L8xchar
+ bt \$30,%edx
+ jnc .Ldone
+- add \$13,%rax
++ add \$25,%rax
++ ret
++.L8xchar:
++ add \$12,%rax
+ .Ldone:
+ ret
+ .align 64
+ .Lopts:
+ .asciz "rc4(8x,int)"
+ .asciz "rc4(8x,char)"
+-.asciz "rc4(1x,char)"
++.asciz "rc4(16x,int)"
+ .asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 64
+ .size RC4_options,.-RC4_options
+@@ -497,8 +656,17 @@ key_se_handler:
___
-+unshift(@xi,pop(@xi));
}
--$code=".text\n";
-+$code.=<<___;
-+.text
-+.extern OPENSSL_ia32cap_X
-+
-+.globl sha1_block_data_order
-+.type sha1_block_data_order,\@function,3
-+.align 16
-+sha1_block_data_order:
-+ mov OPENSSL_ia32cap_X+0(%rip),%r9d
-+ mov OPENSSL_ia32cap_X+4(%rip),%r8d
-+ test \$`1<<9`,%r8d # check SSSE3 bit
-+ jz .Lialu
-+___
-+$code.=<<___ if ($avx);
-+ and \$`1<<28`,%r8d # mask AVX bit
-+ and \$`1<<30`,%r9d # mask "Intel CPU" bit
-+ or %r9d,%r8d
-+ cmp \$`1<<28|1<<30`,%r8d
-+ je _avx_shortcut
-+___
-+$code.=<<___;
-+ jmp _ssse3_shortcut
+-$code =~ s/#([bwd])/$1/gm;
++sub reg_part {
++my ($reg,$conv)=@_;
++ if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
++ elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
++ elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
++ elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
++ return $reg;
++}
+
++$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
++$code =~ s/\`([^\`]*)\`/eval $1/gem;
+ $code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPS} ne "");
+
+ print $code;
+diff -up openssl-1.0.0k/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0k/crypto/sha/asm/sha1-586.pl
+--- openssl-1.0.0k/crypto/sha/asm/sha1-586.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/asm/sha1-586.pl 2013-02-19 21:15:39.633408143 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+
+ # ====================================================================
+ # [Re]written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
+@@ -12,6 +12,8 @@
+ # commentary below], and in 2006 the rest was rewritten in order to
+ # gain freedom to liberate licensing terms.
+
++# January, September 2004.
++#
+ # It was noted that Intel IA-32 C compiler generates code which
+ # performs ~30% *faster* on P4 CPU than original *hand-coded*
+ # SHA1 assembler implementation. To address this problem (and
+@@ -31,12 +33,92 @@
+ # ----------------------------------------------------------------
+ # <appro at fy.chalmers.se>
+
++# August 2009.
++#
++# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
++# '(c&d) + (b&(c^d))', which allows to accumulate partial results
++# and lighten "pressure" on scratch registers. This resulted in
++# >12% performance improvement on contemporary AMD cores (with no
++# degradation on other CPUs:-). Also, the code was revised to maximize
++# "distance" between instructions producing input to 'lea' instruction
++# and the 'lea' instruction itself, which is essential for Intel Atom
++# core and resulted in ~15% improvement.
+
-+.align 16
-+.Lialu:
-+ push %rbx
-+ push %rbp
-+ push %r12
-+ push %r13
-+ mov %rsp,%r11
-+ mov %rdi,$ctx # reassigned argument
-+ sub \$`8+16*4`,%rsp
-+ mov %rsi,$inp # reassigned argument
-+ and \$-64,%rsp
-+ mov %rdx,$num # reassigned argument
-+ mov %r11,`16*4`(%rsp)
-+.Lprologue:
++# October 2010.
++#
++# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
++# is to offload message schedule denoted by Wt in NIST specification,
++# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
++# and in SSE2 context was first explored by Dean Gaudet in 2004, see
++# http://arctic.org/~dean/crypto/sha1.html. Since then several things
++# have changed that made it interesting again:
++#
++# a) XMM units became faster and wider;
++# b) instruction set became more versatile;
++# c) an important observation was made by Max Locktyukhin, which made
++# it possible to reduce amount of instructions required to perform
++# the operation in question, for further details see
++# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
+
-+ mov 0($ctx),$A
-+ mov 4($ctx),$B
-+ mov 8($ctx),$C
-+ mov 12($ctx),$D
-+ mov 16($ctx),$E
-+ jmp .Lloop
-
--&PROLOGUE("sha1_block_data_order");
--$code.=".align 4\n.Lloop:\n";
-+.align 16
-+.Lloop:
-+___
- for($i=0;$i<20;$i++) { &BODY_00_19($i, at V); unshift(@V,pop(@V)); }
- for(;$i<40;$i++) { &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
- for(;$i<60;$i++) { &BODY_40_59($i, at V); unshift(@V,pop(@V)); }
- for(;$i<80;$i++) { &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
- $code.=<<___;
-- add 0($ctx),$E
-- add 4($ctx),$T
-- add 8($ctx),$A
-- add 12($ctx),$B
-- add 16($ctx),$C
-- mov $E,0($ctx)
-- mov $T,4($ctx)
-- mov $A,8($ctx)
-- mov $B,12($ctx)
-- mov $C,16($ctx)
--
-- xchg $E,$A # mov $E,$A
-- xchg $T,$B # mov $T,$B
-- xchg $E,$C # mov $A,$C
-- xchg $T,$D # mov $B,$D
-- # mov $C,$E
-- lea `16*4`($inp),$inp
-+ add 0($ctx),$A
-+ add 4($ctx),$B
-+ add 8($ctx),$C
-+ add 12($ctx),$D
-+ add 16($ctx),$E
-+ mov $A,0($ctx)
-+ mov $B,4($ctx)
-+ mov $C,8($ctx)
-+ mov $D,12($ctx)
-+ mov $E,16($ctx)
++# April 2011.
++#
++# Add AVX code path, probably most controversial... The thing is that
++# switch to AVX alone improves performance by as little as 4% in
++# comparison to SSSE3 code path. But below result doesn't look like
++# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
++# pair of µ-ops, and it's the additional µ-ops, two per round, that
++# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
++# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
++# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
++# cycles per processed byte. But 'sh[rl]d' is not something that used
++# to be fast, nor does it appear to be fast in upcoming Bulldozer
++# [according to its optimization manual]. Which is why AVX code path
++# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
++# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
++# makes no sense to keep the AVX code path. If somebody feels that
++# strongly, it's probably more appropriate to discuss possibility of
++# using vector rotate XOP on AMD...
+
- sub \$1,$num
-+ lea `16*4`($inp),$inp
- jnz .Lloop
++######################################################################
++# Current performance is summarized in following table. Numbers are
++# CPU clock cycles spent to process single byte (less is better).
++#
++# x86 SSSE3 AVX
++# Pentium 15.7 -
++# PIII 11.5 -
++# P4 10.6 -
++# AMD K8 7.1 -
++# Core2 7.3 6.1/+20% -
++# Atom 12.5 9.5(*)/+32% -
++# Westmere 7.3 5.6/+30% -
++# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
++#
++# (*) Loop is 1056 instructions long and expected result is ~8.25.
++# It remains mystery [to me] why ILP is limited to 1.7.
++#
++# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
+
-+ mov `16*4`(%rsp),%rsi
-+ mov (%rsi),%r13
-+ mov 8(%rsi),%r12
-+ mov 16(%rsi),%rbp
-+ mov 24(%rsi),%rbx
-+ lea 32(%rsi),%rsp
-+.Lepilogue:
-+ ret
-+.size sha1_block_data_order,.-sha1_block_data_order
- ___
--&EPILOGUE("sha1_block_data_order");
-+{{{
-+my $Xi=4;
-+my @X=map("%xmm$_",(4..7,0..3));
-+my @Tx=map("%xmm$_",(8..10));
-+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
-+my @T=("%esi","%edi");
-+my $j=0;
-+my $K_XX_XX="%r11";
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC,"${dir}","${dir}../../perlasm");
+ require "x86asm.pl";
+
+ &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
+
++$xmm=1; $ymm=0;
++for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
-+my $_rol=sub { &rol(@_) };
-+my $_ror=sub { &ror(@_) };
++$ymm=1 if ($xmm &&
++ `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
++ $1>=2.19); # first version supporting AVX
+
- $code.=<<___;
--.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-+.type sha1_block_data_order_ssse3,\@function,3
- .align 16
-+sha1_block_data_order_ssse3:
-+_ssse3_shortcut:
-+ push %rbx
-+ push %rbp
-+ push %r12
-+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
-+___
-+$code.=<<___ if ($win64);
-+ movaps %xmm6,64+0(%rsp)
-+ movaps %xmm7,64+16(%rsp)
-+ movaps %xmm8,64+32(%rsp)
-+ movaps %xmm9,64+48(%rsp)
-+ movaps %xmm10,64+64(%rsp)
-+.Lprologue_ssse3:
-+___
-+$code.=<<___;
-+ mov %rdi,$ctx # reassigned argument
-+ mov %rsi,$inp # reassigned argument
-+ mov %rdx,$num # reassigned argument
++$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
++ $1>=2.03); # first version supporting AVX
+
-+ shl \$6,$num
-+ add $inp,$num
-+ lea K_XX_XX(%rip),$K_XX_XX
++&external_label("OPENSSL_ia32cap_X") if ($xmm);
+
-+ mov 0($ctx),$A # load context
-+ mov 4($ctx),$B
-+ mov 8($ctx),$C
-+ mov 12($ctx),$D
-+ mov $B, at T[0] # magic seed
-+ mov 16($ctx),$E
+
-+ movdqa 64($K_XX_XX), at X[2] # pbswap mask
-+ movdqa 0($K_XX_XX), at Tx[1] # K_00_19
-+ movdqu 0($inp), at X[-4&7] # load input to %xmm[0-3]
-+ movdqu 16($inp), at X[-3&7]
-+ movdqu 32($inp), at X[-2&7]
-+ movdqu 48($inp), at X[-1&7]
-+ pshufb @X[2], at X[-4&7] # byte swap
-+ add \$64,$inp
-+ pshufb @X[2], at X[-3&7]
-+ pshufb @X[2], at X[-2&7]
-+ pshufb @X[2], at X[-1&7]
-+ paddd @Tx[1], at X[-4&7] # add K_00_19
-+ paddd @Tx[1], at X[-3&7]
-+ paddd @Tx[1], at X[-2&7]
-+ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
-+ psubd @Tx[1], at X[-4&7] # restore X[]
-+ movdqa @X[-3&7],16(%rsp)
-+ psubd @Tx[1], at X[-3&7]
-+ movdqa @X[-2&7],32(%rsp)
-+ psubd @Tx[1], at X[-2&7]
-+ jmp .Loop_ssse3
-+___
+ $A="eax";
+ $B="ebx";
+ $C="ecx";
+@@ -47,6 +129,10 @@ $tmp1="ebp";
+
+ @V=($A,$B,$C,$D,$E,$T);
+
++$alt=0; # 1 denotes alternative IALU implementation, which performs
++ # 8% *worse* on P4, same on Westmere and Atom, 2% better on
++ # Sandy Bridge...
+
-+sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
-+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
-+ my $arg = pop;
-+ $arg = "\$$arg" if ($arg*1 eq $arg);
-+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+ sub BODY_00_15
+ {
+ local($n,$a,$b,$c,$d,$e,$f)=@_;
+@@ -59,16 +145,18 @@ sub BODY_00_15
+ &rotl($tmp1,5); # tmp1=ROTATE(a,5)
+ &xor($f,$d);
+ &add($tmp1,$e); # tmp1+=e;
+- &and($f,$b);
+- &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
++ &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
+ # with xi, also note that e becomes
+ # f in next round...
+- &xor($f,$d); # f holds F_00_19(b,c,d)
++ &and($f,$b);
+ &rotr($b,2); # b=ROTATE(b,30)
+- &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
++ &xor($f,$d); # f holds F_00_19(b,c,d)
++ &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
+
+- if ($n==15) { &add($f,$tmp1); } # f+=tmp1
++ if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
++ &add($f,$tmp1); } # f+=tmp1
+ else { &add($tmp1,$f); } # f becomes a in next round
++ &mov($tmp1,$a) if ($alt && $n==15);
+ }
+
+ sub BODY_16_19
+@@ -77,22 +165,41 @@ sub BODY_16_19
+
+ &comment("16_19 $n");
+
+- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+- &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
+- &xor($f,&swtmp(($n+2)%16));
+- &xor($tmp1,$d);
+- &xor($f,&swtmp(($n+8)%16));
+- &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d)
+- &rotr($b,2); # b=ROTATE(b,30)
++if ($alt) {
++ &xor($c,$d);
++ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
++ &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
++ &xor($f,&swtmp(($n+8)%16));
++ &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
++ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
++ &rotl($f,1); # f=ROTATE(f,1)
++ &add($e,$tmp1); # e+=F_00_19(b,c,d)
++ &xor($c,$d); # restore $c
++ &mov($tmp1,$a); # b in next round
++ &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
++ &mov(&swtmp($n%16),$f); # xi=f
++ &rotl($a,5); # ROTATE(a,5)
++ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
++ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
++ &add($f,$a); # f+=ROTATE(a,5)
++} else {
++ &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
++ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
++ &xor($tmp1,$d);
++ &xor($f,&swtmp(($n+8)%16));
++ &and($tmp1,$b);
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+ &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
+- &mov(&swtmp($n%16),$f); # xi=f
+- &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
+- &mov($e,$a); # e becomes volatile
+- &rotl($e,5); # e=ROTATE(a,5)
+- &add($f,$tmp1); # f+=F_00_19(b,c,d)
+- &add($f,$e); # f+=ROTATE(a,5)
++ &add($e,$tmp1); # e+=F_00_19(b,c,d)
++ &mov($tmp1,$a);
++ &rotr($b,2); # b=ROTATE(b,30)
++ &mov(&swtmp($n%16),$f); # xi=f
++ &rotl($tmp1,5); # ROTATE(a,5)
++ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
++ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
++ &add($f,$tmp1); # f+=ROTATE(a,5)
++}
+ }
+
+ sub BODY_20_39
+@@ -102,21 +209,41 @@ sub BODY_20_39
+
+ &comment("20_39 $n");
+
++if ($alt) {
++ &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
++ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
++ &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
++ &xor($f,&swtmp(($n+8)%16));
++ &add($e,$tmp1); # e+=F_20_39(b,c,d)
++ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
++ &rotl($f,1); # f=ROTATE(f,1)
++ &mov($tmp1,$a); # b in next round
++ &rotr($b,7); # b=ROTATE(b,30)
++ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
++ &rotl($a,5); # ROTATE(a,5)
++ &xor($b,$c) if($n==39);# warm up for BODY_40_59
++ &and($tmp1,$b) if($n==39);
++ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
++ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
++ &add($f,$a); # f+=ROTATE(a,5)
++ &rotr($a,5) if ($n==79);
++} else {
+ &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
+- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+- &rotr($b,2); # b=ROTATE(b,30)
+- &xor($f,&swtmp(($n+2)%16));
++ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+ &xor($tmp1,$c);
+ &xor($f,&swtmp(($n+8)%16));
+ &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+- &add($tmp1,$e);
+- &mov(&swtmp($n%16),$f); # xi=f
+- &mov($e,$a); # e becomes volatile
+- &rotl($e,5); # e=ROTATE(a,5)
+- &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e
+- &add($f,$e); # f+=ROTATE(a,5)
++ &add($e,$tmp1); # e+=F_20_39(b,c,d)
++ &rotr($b,2); # b=ROTATE(b,30)
++ &mov($tmp1,$a);
++ &rotl($tmp1,5); # ROTATE(a,5)
++ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
++ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
++ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
++ &add($f,$tmp1); # f+=ROTATE(a,5)
+}
-+
-+sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
-+{ use integer;
-+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
-+ my ($a,$b,$c,$d,$e);
-+
-+ &movdqa (@X[0], at X[-3&7]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &movdqa (@Tx[0], at X[-1&7]);
-+ &palignr(@X[0], at X[-4&7],8); # compose "X[-14]" in "X[0]"
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &paddd (@Tx[1], at X[-1&7]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &pxor (@X[0], at X[-4&7]); # "X[0]"^="X[-16]"
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &pxor (@Tx[0], at X[-2&7]); # "X[-3]"^"X[-8]"
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &pxor (@X[0], at Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &movdqa (@Tx[2], at X[0]);
-+ &movdqa (@Tx[0], at X[0]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
-+ &paddd (@X[0], at X[0]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &psrld (@Tx[0],31);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &movdqa (@Tx[1], at Tx[2]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &psrld (@Tx[2],30);
-+ &por (@X[0], at Tx[0]); # "X[0]"<<<=1
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &pslld (@Tx[1],2);
-+ &pxor (@X[0], at Tx[2]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+
-+ &pxor (@X[0], at Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
-+
-+ foreach (@insns) { eval; } # remaining instructions [if any]
-+
-+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
-+ push(@Tx,shift(@Tx));
+ }
+
+ sub BODY_40_59
+@@ -125,41 +252,86 @@ sub BODY_40_59
+
+ &comment("40_59 $n");
+
+- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
+- &mov($tmp1,&swtmp(($n+2)%16));
+- &xor($f,$tmp1);
+- &mov($tmp1,&swtmp(($n+8)%16));
+- &xor($f,$tmp1);
+- &mov($tmp1,&swtmp(($n+13)%16));
+- &xor($f,$tmp1); # f holds xa^xb^xc^xd
+- &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
++if ($alt) {
++ &add($e,$tmp1); # e+=b&(c^d)
++ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
++ &mov($tmp1,$d);
++ &xor($f,&swtmp(($n+8)%16));
++ &xor($c,$d); # restore $c
++ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
+ &rotl($f,1); # f=ROTATE(f,1)
+- &or($tmp1,$c);
+- &mov(&swtmp($n%16),$f); # xi=f
+- &and($tmp1,$d);
+- &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
+- &mov($e,$b); # e becomes volatile and is used
+- # to calculate F_40_59(b,c,d)
++ &and($tmp1,$c);
++ &rotr($b,7); # b=ROTATE(b,30)
++ &add($e,$tmp1); # e+=c&d
++ &mov($tmp1,$a); # b in next round
++ &mov(&swtmp($n%16),$f); # xi=f
++ &rotl($a,5); # ROTATE(a,5)
++ &xor($b,$c) if ($n<59);
++ &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
++ &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
++ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
++ &add($f,$a); # f+=ROTATE(a,5)
++} else {
++ &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
++ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
++ &xor($tmp1,$d);
++ &xor($f,&swtmp(($n+8)%16));
++ &and($tmp1,$b);
++ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
++ &rotl($f,1); # f=ROTATE(f,1)
++ &add($tmp1,$e); # b&(c^d)+=e
+ &rotr($b,2); # b=ROTATE(b,30)
+- &and($e,$c);
+- &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
+- &mov($e,$a);
+- &rotl($e,5); # e=ROTATE(a,5)
+- &add($f,$tmp1); # f+=tmp1;
++ &mov($e,$a); # e becomes volatile
++ &rotl($e,5); # ROTATE(a,5)
++ &mov(&swtmp($n%16),$f); # xi=f
++ &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
++ &mov($tmp1,$c);
+ &add($f,$e); # f+=ROTATE(a,5)
++ &and($tmp1,$d);
++ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
++ &add($f,$tmp1); # f+=c&d
+}
+ }
+
+ &function_begin("sha1_block_data_order");
++if ($xmm) {
++ &static_label("ssse3_shortcut");
++ &static_label("avx_shortcut") if ($ymm);
++ &static_label("K_XX_XX");
+
-+sub Xupdate_ssse3_32_79()
-+{ use integer;
-+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
-+ my ($a,$b,$c,$d,$e);
-+
-+ &movdqa (@Tx[0], at X[-1&7]) if ($Xi==8);
-+ eval(shift(@insns)); # body_20_39
-+ &pxor (@X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
-+ &palignr(@Tx[0], at X[-2&7],8); # compose "X[-6]"
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
++ &call (&label("pic_point")); # make it PIC!
++ &set_label("pic_point");
++ &blindpop($tmp1);
++ &picmeup($T,"OPENSSL_ia32cap_X",$tmp1,&label("pic_point"));
++ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+
-+ &pxor (@X[0], at X[-7&7]); # "X[0]"^="X[-28]"
-+ eval(shift(@insns));
-+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
-+ if ($Xi%5) {
-+ &movdqa (@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
-+ } else { # ... or load next one
-+ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
++ &mov ($A,&DWP(0,$T));
++ &mov ($D,&DWP(4,$T));
++ &test ($D,1<<9); # check SSSE3 bit
++ &jz (&label("x86"));
++ &test ($A,1<<24); # check FXSR bit
++ &jz (&label("x86"));
++ if ($ymm) {
++ &and ($D,1<<28); # mask AVX bit
++ &and ($A,1<<30); # mask "Intel CPU" bit
++ &or ($A,$D);
++ &cmp ($A,1<<28|1<<30);
++ &je (&label("avx_shortcut"));
+ }
-+ &paddd (@Tx[1], at X[-1&7]);
-+ eval(shift(@insns)); # ror
-+ eval(shift(@insns));
-+
-+ &pxor (@X[0], at Tx[0]); # "X[0]"^="X[-6]"
-+ eval(shift(@insns)); # body_20_39
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
-+
-+ &movdqa (@Tx[0], at X[0]);
-+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # ror
-+ eval(shift(@insns));
-+
-+ &pslld (@X[0],2);
-+ eval(shift(@insns)); # body_20_39
-+ eval(shift(@insns));
-+ &psrld (@Tx[0],30);
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # ror
-+ eval(shift(@insns));
-+
-+ &por (@X[0], at Tx[0]); # "X[0]"<<<=2
-+ eval(shift(@insns)); # body_20_39
-+ eval(shift(@insns));
-+ &movdqa (@Tx[1], at X[0]) if ($Xi<19);
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
-+ eval(shift(@insns));
-+
-+ foreach (@insns) { eval; } # remaining instructions
-+
-+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
-+ push(@Tx,shift(@Tx));
++ &jmp (&label("ssse3_shortcut"));
++ &set_label("x86",16);
+}
+ &mov($tmp1,&wparam(0)); # SHA_CTX *c
+ &mov($T,&wparam(1)); # const void *input
+ &mov($A,&wparam(2)); # size_t num
+- &stack_push(16); # allocate X[16]
++ &stack_push(16+3); # allocate X[16]
+ &shl($A,6);
+ &add($A,$T);
+ &mov(&wparam(2),$A); # pointer beyond the end of input
+ &mov($E,&DWP(16,$tmp1));# pre-load E
++ &jmp(&label("loop"));
+
+- &set_label("loop",16);
++&set_label("loop",16);
+
+ # copy input chunk to X, but reversing byte order!
+ for ($i=0; $i<16; $i+=4)
+@@ -213,8 +385,845 @@ sub BODY_40_59
+ &mov(&DWP(16,$tmp1),$C);
+ &jb(&label("loop"));
+
+- &stack_pop(16);
++ &stack_pop(16+3);
+ &function_end("sha1_block_data_order");
+
-+sub Xuplast_ssse3_80()
-+{ use integer;
-+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
-+ my ($a,$b,$c,$d,$e);
++if ($xmm) {
++######################################################################
++# The SSSE3 implementation.
++#
++# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
++# 32 elements of the message schedule or Xupdate outputs. First 4
++# quadruples are simply byte-swapped input, next 4 are calculated
++# according to method originally suggested by Dean Gaudet (modulo
++# being implemented in SSSE3). Once 8 quadruples or 32 elements are
++# collected, it switches to routine proposed by Max Locktyukhin.
++#
++# Calculations inevitably require temporary registers, and there are
++# no %xmm registers left to spare. For this reason part of the ring
++# buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring
++# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
++# X[-5], and X[4] - X[-4]...
++#
++# Another notable optimization is aggressive stack frame compression
++# aiming to minimize amount of 9-byte instructions...
++#
++# Yet another notable optimization is "jumping" $B variable. It means
++# that there is no register permanently allocated for $B value. This
++# allowed to eliminate one instruction from body_20_39...
++#
++my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
++my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
++my @V=($A,$B,$C,$D,$E);
++my $j=0; # hash round
++my @T=($T,$tmp1);
++my $inp;
+
-+ eval(shift(@insns));
-+ &paddd (@Tx[1], at X[-1&7]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
++my $_rol=sub { &rol(@_) };
++my $_ror=sub { &ror(@_) };
+
-+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer IALU
++&function_begin("_sha1_block_data_order_ssse3");
++ &call (&label("pic_point")); # make it PIC!
++ &set_label("pic_point");
++ &blindpop($tmp1);
++ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
++&set_label("ssse3_shortcut");
+
-+ foreach (@insns) { eval; } # remaining instructions
++ &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
++ &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
++ &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
++ &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
++ &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
+
-+ &cmp ($inp,$num);
-+ &je (".Ldone_ssse3");
++ &mov ($E,&wparam(0)); # load argument block
++ &mov ($inp=@T[1],&wparam(1));
++ &mov ($D,&wparam(2));
++ &mov (@T[0],"esp");
+
-+ unshift(@Tx,pop(@Tx));
++ # stack frame layout
++ #
++ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
++ # X[4]+K X[5]+K X[6]+K X[7]+K
++ # X[8]+K X[9]+K X[10]+K X[11]+K
++ # X[12]+K X[13]+K X[14]+K X[15]+K
++ #
++ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
++ # X[4] X[5] X[6] X[7]
++ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
++ #
++ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
++ # K_40_59 K_40_59 K_40_59 K_40_59
++ # K_60_79 K_60_79 K_60_79 K_60_79
++ # K_00_19 K_00_19 K_00_19 K_00_19
++ # pbswap mask
++ #
++ # +192 ctx # argument block
++ # +196 inp
++ # +200 end
++ # +204 esp
++ &sub ("esp",208);
++ &and ("esp",-64);
+
-+ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
-+ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
-+ &movdqu (@X[-4&7],"0($inp)"); # load input
-+ &movdqu (@X[-3&7],"16($inp)");
-+ &movdqu (@X[-2&7],"32($inp)");
-+ &movdqu (@X[-1&7],"48($inp)");
-+ &pshufb (@X[-4&7], at X[2]); # byte swap
++ &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants
++ &movdqa (&QWP(112+16,"esp"),@X[5]);
++ &movdqa (&QWP(112+32,"esp"),@X[6]);
++ &shl ($D,6); # len*64
++ &movdqa (&QWP(112+48,"esp"),@X[3]);
++ &add ($D,$inp); # end of input
++ &movdqa (&QWP(112+64,"esp"),@X[2]);
+ &add ($inp,64);
++ &mov (&DWP(192+0,"esp"),$E); # save argument block
++ &mov (&DWP(192+4,"esp"),$inp);
++ &mov (&DWP(192+8,"esp"),$D);
++ &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp
+
-+ $Xi=0;
-+}
++ &mov ($A,&DWP(0,$E)); # load context
++ &mov ($B,&DWP(4,$E));
++ &mov ($C,&DWP(8,$E));
++ &mov ($D,&DWP(12,$E));
++ &mov ($E,&DWP(16,$E));
++ &mov (@T[0],$B); # magic seed
+
-+sub Xloop_ssse3()
++ &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
++ &movdqu (@X[-3&7],&QWP(-48,$inp));
++ &movdqu (@X[-2&7],&QWP(-32,$inp));
++ &movdqu (@X[-1&7],&QWP(-16,$inp));
++ &pshufb (@X[-4&7],@X[2]); # byte swap
++ &pshufb (@X[-3&7],@X[2]);
++ &pshufb (@X[-2&7],@X[2]);
++ &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot
++ &pshufb (@X[-1&7],@X[2]);
++ &paddd (@X[-4&7],@X[3]); # add K_00_19
++ &paddd (@X[-3&7],@X[3]);
++ &paddd (@X[-2&7],@X[3]);
++ &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU
++ &psubd (@X[-4&7],@X[3]); # restore X[]
++ &movdqa (&QWP(0+16,"esp"),@X[-3&7]);
++ &psubd (@X[-3&7],@X[3]);
++ &movdqa (&QWP(0+32,"esp"),@X[-2&7]);
++ &psubd (@X[-2&7],@X[3]);
++ &movdqa (@X[0],@X[-3&7]);
++ &jmp (&label("loop"));
++
++######################################################################
++# SSE instruction sequence is first broken to groups of independent
++# instructions, independent in respect to their inputs and shifter
++# (not all architectures have more than one). Then IALU instructions
++# are "knitted in" between the SSE groups. Distance is maintained for
++# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
++# [which allegedly also implements SSSE3]...
++#
++# Temporary registers usage. X[2] is volatile at the entry and at the
++# end is restored from backtrace ring buffer. X[3] is expected to
++# contain current K_XX_XX constant and is used to calculate X[-1]+K
++# from previous round, it becomes volatile the moment the value is
++# saved to stack for transfer to IALU. X[4] becomes volatile whenever
++# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
++# end it is loaded with next K_XX_XX [which becomes X[3] in next
++# round]...
++#
++sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
+{ use integer;
+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &pshufb (@X[($Xi-3)&7], at X[2]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &paddd (@X[($Xi-4)&7], at Tx[1]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &movdqa (eval(16*$Xi)."(%rsp)", at X[($Xi-4)&7]); # X[]+K xfer to IALU
++ &palignr(@X[0], at X[-4&7],8); # compose "X[-14]" in "X[0]"
++ &movdqa (@X[2], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &psubd (@X[($Xi-4)&7], at Tx[1]);
-+
-+ foreach (@insns) { eval; }
-+ $Xi++;
-+}
-+
-+sub Xtail_ssse3()
-+{ use integer;
-+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
-+ my ($a,$b,$c,$d,$e);
-+
-+ foreach (@insns) { eval; }
-+}
-+
-+sub body_00_19 () {
-+ (
-+ '($a,$b,$c,$d,$e)=@V;'.
-+ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
-+ '&xor ($c,$d);',
-+ '&mov (@T[1],$a);', # $b in next round
-+ '&$_rol ($a,5);',
-+ '&and (@T[0],$c);', # ($b&($c^$d))
-+ '&xor ($c,$d);', # restore $c
-+ '&xor (@T[0],$d);',
-+ '&add ($e,$a);',
-+ '&$_ror ($b,$j?7:2);', # $b>>>2
-+ '&add ($e, at T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
-+ );
-+}
-+
-+sub body_20_39 () {
-+ (
-+ '($a,$b,$c,$d,$e)=@V;'.
-+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
-+ '&xor (@T[0],$d);', # ($b^$d)
-+ '&mov (@T[1],$a);', # $b in next round
-+ '&$_rol ($a,5);',
-+ '&xor (@T[0],$c);', # ($b^$d^$c)
-+ '&add ($e,$a);',
-+ '&$_ror ($b,7);', # $b>>>2
-+ '&add ($e, at T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
-+ );
-+}
-+
-+sub body_40_59 () {
-+ (
-+ '($a,$b,$c,$d,$e)=@V;'.
-+ '&mov (@T[1],$c);',
-+ '&xor ($c,$d);',
-+ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
-+ '&and (@T[1],$d);',
-+ '&and (@T[0],$c);', # ($b&($c^$d))
-+ '&$_ror ($b,7);', # $b>>>2
-+ '&add ($e, at T[1]);',
-+ '&mov (@T[1],$a);', # $b in next round
-+ '&$_rol ($a,5);',
-+ '&add ($e, at T[0]);',
-+ '&xor ($c,$d);', # restore $c
-+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
-+ );
-+}
-+$code.=<<___;
-+.align 16
-+.Loop_ssse3:
-+___
-+ &Xupdate_ssse3_16_31(\&body_00_19);
-+ &Xupdate_ssse3_16_31(\&body_00_19);
-+ &Xupdate_ssse3_16_31(\&body_00_19);
-+ &Xupdate_ssse3_16_31(\&body_00_19);
-+ &Xupdate_ssse3_32_79(\&body_00_19);
-+ &Xupdate_ssse3_32_79(\&body_20_39);
-+ &Xupdate_ssse3_32_79(\&body_20_39);
-+ &Xupdate_ssse3_32_79(\&body_20_39);
-+ &Xupdate_ssse3_32_79(\&body_20_39);
-+ &Xupdate_ssse3_32_79(\&body_20_39);
-+ &Xupdate_ssse3_32_79(\&body_40_59);
-+ &Xupdate_ssse3_32_79(\&body_40_59);
-+ &Xupdate_ssse3_32_79(\&body_40_59);
-+ &Xupdate_ssse3_32_79(\&body_40_59);
-+ &Xupdate_ssse3_32_79(\&body_40_59);
-+ &Xupdate_ssse3_32_79(\&body_20_39);
-+ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
-+
-+ $saved_j=$j; @saved_V=@V;
+
-+ &Xloop_ssse3(\&body_20_39);
-+ &Xloop_ssse3(\&body_20_39);
-+ &Xloop_ssse3(\&body_20_39);
++ &paddd (@X[3], at X[-1&7]);
++ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &psrldq (@X[2],4); # "X[-3]", 3 dwords
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &pxor (@X[0], at X[-4&7]); # "X[0]"^="X[-16]"
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+$code.=<<___;
-+ add 0($ctx),$A # update context
-+ add 4($ctx), at T[0]
-+ add 8($ctx),$C
-+ add 12($ctx),$D
-+ mov $A,0($ctx)
-+ add 16($ctx),$E
-+ mov @T[0],4($ctx)
-+ mov @T[0],$B # magic seed
-+ mov $C,8($ctx)
-+ mov $D,12($ctx)
-+ mov $E,16($ctx)
-+ jmp .Loop_ssse3
++ &pxor (@X[2], at X[-2&7]); # "X[-3]"^"X[-8]"
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+.align 16
-+.Ldone_ssse3:
-+___
-+ $j=$saved_j; @V=@saved_V;
++ &pxor (@X[0], at X[2]); # "X[0]"^="X[-3]"^"X[-8]"
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+ &Xtail_ssse3(\&body_20_39);
-+ &Xtail_ssse3(\&body_20_39);
-+ &Xtail_ssse3(\&body_20_39);
++ &movdqa (@X[4], at X[0]);
++ &movdqa (@X[2], at X[0]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+$code.=<<___;
-+ add 0($ctx),$A # update context
-+ add 4($ctx), at T[0]
-+ add 8($ctx),$C
-+ mov $A,0($ctx)
-+ add 12($ctx),$D
-+ mov @T[0],4($ctx)
-+ add 16($ctx),$E
-+ mov $C,8($ctx)
-+ mov $D,12($ctx)
-+ mov $E,16($ctx)
-+___
-+$code.=<<___ if ($win64);
-+ movaps 64+0(%rsp),%xmm6
-+ movaps 64+16(%rsp),%xmm7
-+ movaps 64+32(%rsp),%xmm8
-+ movaps 64+48(%rsp),%xmm9
-+ movaps 64+64(%rsp),%xmm10
-+___
-+$code.=<<___;
-+ lea `64+($win64?6*16:0)`(%rsp),%rsi
-+ mov 0(%rsi),%r12
-+ mov 8(%rsi),%rbp
-+ mov 16(%rsi),%rbx
-+ lea 24(%rsi),%rsp
-+.Lepilogue_ssse3:
-+ ret
-+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
-+___
++ &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
++ &paddd (@X[0], at X[0]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+if ($avx) {
-+my $Xi=4;
-+my @X=map("%xmm$_",(4..7,0..3));
-+my @Tx=map("%xmm$_",(8..10));
-+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
-+my @T=("%esi","%edi");
-+my $j=0;
-+my $K_XX_XX="%r11";
++ &psrld (@X[2],31);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &movdqa (@X[3], at X[4]);
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+my $_rol=sub { &shld(@_[0], at _) };
-+my $_ror=sub { &shrd(@_[0], at _) };
++ &psrld (@X[4],30);
++ &por (@X[0], at X[2]); # "X[0]"<<<=1
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+$code.=<<___;
-+.type sha1_block_data_order_avx,\@function,3
-+.align 16
-+sha1_block_data_order_avx:
-+_avx_shortcut:
-+ push %rbx
-+ push %rbp
-+ push %r12
-+ lea `-64-($win64?5*16:0)`(%rsp),%rsp
-+___
-+$code.=<<___ if ($win64);
-+ movaps %xmm6,64+0(%rsp)
-+ movaps %xmm7,64+16(%rsp)
-+ movaps %xmm8,64+32(%rsp)
-+ movaps %xmm9,64+48(%rsp)
-+ movaps %xmm10,64+64(%rsp)
-+.Lprologue_avx:
-+___
-+$code.=<<___;
-+ mov %rdi,$ctx # reassigned argument
-+ mov %rsi,$inp # reassigned argument
-+ mov %rdx,$num # reassigned argument
-+ vzeroall
++ &pslld (@X[3],2);
++ &pxor (@X[0], at X[4]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+ shl \$6,$num
-+ add $inp,$num
-+ lea K_XX_XX(%rip),$K_XX_XX
++ &pxor (@X[0], at X[3]); # "X[0]"^=("X[0]"<<96)<<<2
++ &movdqa (@X[1], at X[-2&7]) if ($Xi<7);
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+ mov 0($ctx),$A # load context
-+ mov 4($ctx),$B
-+ mov 8($ctx),$C
-+ mov 12($ctx),$D
-+ mov $B, at T[0] # magic seed
-+ mov 16($ctx),$E
++ foreach (@insns) { eval; } # remaining instructions [if any]
+
-+ vmovdqa 64($K_XX_XX), at X[2] # pbswap mask
-+ vmovdqa 0($K_XX_XX), at Tx[1] # K_00_19
-+ vmovdqu 0($inp), at X[-4&7] # load input to %xmm[0-3]
-+ vmovdqu 16($inp), at X[-3&7]
-+ vmovdqu 32($inp), at X[-2&7]
-+ vmovdqu 48($inp), at X[-1&7]
-+ vpshufb @X[2], at X[-4&7], at X[-4&7] # byte swap
-+ add \$64,$inp
-+ vpshufb @X[2], at X[-3&7], at X[-3&7]
-+ vpshufb @X[2], at X[-2&7], at X[-2&7]
-+ vpshufb @X[2], at X[-1&7], at X[-1&7]
-+ vpaddd @Tx[1], at X[-4&7], at X[0] # add K_00_19
-+ vpaddd @Tx[1], at X[-3&7], at X[1]
-+ vpaddd @Tx[1], at X[-2&7], at X[2]
-+ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
-+ vmovdqa @X[1],16(%rsp)
-+ vmovdqa @X[2],32(%rsp)
-+ jmp .Loop_avx
-+___
++ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++}
+
-+sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
++sub Xupdate_ssse3_32_79()
+{ use integer;
+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
++ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
++ &movdqa (@X[2], at X[-1&7]) if ($Xi==8);
++ eval(shift(@insns)); # body_20_39
++ &pxor (@X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
++ &palignr(@X[2], at X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpalignr(@X[0], at X[-3&7], at X[-4&7],8); # compose "X[-14]" in "X[0]"
++ eval(shift(@insns)); # rol
++
++ &pxor (@X[0], at X[-7&7]); # "X[0]"^="X[-28]"
++ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]); # save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
++ if ($Xi%5) {
++ &movdqa (@X[4], at X[3]); # "perpetuate" K_XX_XX...
++ } else { # ... or load next one
++ &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
++ }
++ &paddd (@X[3], at X[-1&7]);
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
+
-+ &vpaddd (@Tx[1], at Tx[1], at X[-1&7]);
++ &pxor (@X[0], at X[2]); # "X[0]"^="X[-6]"
++ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpsrldq(@Tx[0], at X[-1&7],4); # "X[-3]", 3 dwords
++ eval(shift(@insns)); # rol
++
++ &movdqa (@X[2], at X[0]);
++ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"^="X[-16]"
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
++
++ &pslld (@X[0],2);
++ eval(shift(@insns)); # body_20_39
++ eval(shift(@insns));
++ &psrld (@X[2],30);
++ eval(shift(@insns));
++ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
+
-+ &vpxor (@Tx[0], at Tx[0], at X[-2&7]); # "X[-3]"^"X[-8]"
++ &por (@X[0], at X[2]); # "X[0]"<<<=2
++ eval(shift(@insns)); # body_20_39
++ eval(shift(@insns));
++ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
++ eval(shift(@insns));
++ eval(shift(@insns)); # rol
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # ror
++ &movdqa (@X[3], at X[0]) if ($Xi<19);
++ eval(shift(@insns));
++
++ foreach (@insns) { eval; } # remaining instructions
++
++ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++}
++
++sub Xuplast_ssse3_80()
++{ use integer;
++ my $body = shift;
++ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my ($a,$b,$c,$d,$e);
++
++ eval(shift(@insns));
++ &paddd (@X[3], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpxor (@X[0], at X[0], at Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
-+ eval(shift(@insns));
-+ eval(shift(@insns));
++ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer IALU
++
++ foreach (@insns) { eval; } # remaining instructions
++
++ &mov ($inp=@T[1],&DWP(192+4,"esp"));
++ &cmp ($inp,&DWP(192+8,"esp"));
++ &je (&label("done"));
++
++ &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
++ &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
++ &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
++ &movdqu (@X[-3&7],&QWP(16,$inp));
++ &movdqu (@X[-2&7],&QWP(32,$inp));
++ &movdqu (@X[-1&7],&QWP(48,$inp));
++ &add ($inp,64);
++ &pshufb (@X[-4&7], at X[2]); # byte swap
++ &mov (&DWP(192+4,"esp"),$inp);
++ &movdqa (&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
++
++ $Xi=0;
++}
++
++sub Xloop_ssse3()
++{ use integer;
++ my $body = shift;
++ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my ($a,$b,$c,$d,$e);
+
-+ &vpsrld (@Tx[0], at X[0],31);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
-+
-+ &vpslldq(@Tx[2], at X[0],12); # "X[0]"<<96, extract one dword
-+ &vpaddd (@X[0], at X[0], at X[0]);
++ &pshufb (@X[($Xi-3)&7], at X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
++ &paddd (@X[($Xi-4)&7], at X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+
-+ &vpsrld (@Tx[1], at Tx[2],30);
-+ &vpor (@X[0], at X[0], at Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
++ &movdqa (&QWP(0+16*$Xi,"esp"), at X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
++ &psubd (@X[($Xi-4)&7], at X[3]);
+
-+ &vpslld (@Tx[2], at Tx[2],2);
-+ &vpxor (@X[0], at X[0], at Tx[1]);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ eval(shift(@insns));
++ foreach (@insns) { eval; }
++ $Xi++;
++}
+
-+ &vpxor (@X[0], at X[0], at Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
-+ eval(shift(@insns));
-+ eval(shift(@insns));
++sub Xtail_ssse3()
++{ use integer;
++ my $body = shift;
++ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my ($a,$b,$c,$d,$e);
+
++ foreach (@insns) { eval; }
++}
+
-+ foreach (@insns) { eval; } # remaining instructions [if any]
++sub body_00_19 () {
++ (
++ '($a,$b,$c,$d,$e)=@V;'.
++ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
++ '&xor ($c,$d);',
++ '&mov (@T[1],$a);', # $b in next round
++ '&$_rol ($a,5);',
++ '&and (@T[0],$c);', # ($b&($c^$d))
++ '&xor ($c,$d);', # restore $c
++ '&xor (@T[0],$d);',
++ '&add ($e,$a);',
++ '&$_ror ($b,$j?7:2);', # $b>>>2
++ '&add ($e, at T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
++ );
++}
+
-+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
-+ push(@Tx,shift(@Tx));
++sub body_20_39 () {
++ (
++ '($a,$b,$c,$d,$e)=@V;'.
++ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
++ '&xor (@T[0],$d);', # ($b^$d)
++ '&mov (@T[1],$a);', # $b in next round
++ '&$_rol ($a,5);',
++ '&xor (@T[0],$c);', # ($b^$d^$c)
++ '&add ($e,$a);',
++ '&$_ror ($b,7);', # $b>>>2
++ '&add ($e, at T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
++ );
+}
+
-+sub Xupdate_avx_32_79()
++sub body_40_59 () {
++ (
++ '($a,$b,$c,$d,$e)=@V;'.
++ '&mov (@T[1],$c);',
++ '&xor ($c,$d);',
++ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
++ '&and (@T[1],$d);',
++ '&and (@T[0],$c);', # ($b&($c^$d))
++ '&$_ror ($b,7);', # $b>>>2
++ '&add ($e, at T[1]);',
++ '&mov (@T[1],$a);', # $b in next round
++ '&$_rol ($a,5);',
++ '&add ($e, at T[0]);',
++ '&xor ($c,$d);', # restore $c
++ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
++ );
++}
++
++&set_label("loop",16);
++ &Xupdate_ssse3_16_31(\&body_00_19);
++ &Xupdate_ssse3_16_31(\&body_00_19);
++ &Xupdate_ssse3_16_31(\&body_00_19);
++ &Xupdate_ssse3_16_31(\&body_00_19);
++ &Xupdate_ssse3_32_79(\&body_00_19);
++ &Xupdate_ssse3_32_79(\&body_20_39);
++ &Xupdate_ssse3_32_79(\&body_20_39);
++ &Xupdate_ssse3_32_79(\&body_20_39);
++ &Xupdate_ssse3_32_79(\&body_20_39);
++ &Xupdate_ssse3_32_79(\&body_20_39);
++ &Xupdate_ssse3_32_79(\&body_40_59);
++ &Xupdate_ssse3_32_79(\&body_40_59);
++ &Xupdate_ssse3_32_79(\&body_40_59);
++ &Xupdate_ssse3_32_79(\&body_40_59);
++ &Xupdate_ssse3_32_79(\&body_40_59);
++ &Xupdate_ssse3_32_79(\&body_20_39);
++ &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
++
++ $saved_j=$j; @saved_V=@V;
++
++ &Xloop_ssse3(\&body_20_39);
++ &Xloop_ssse3(\&body_20_39);
++ &Xloop_ssse3(\&body_20_39);
++
++ &mov (@T[1],&DWP(192,"esp")); # update context
++ &add ($A,&DWP(0, at T[1]));
++ &add (@T[0],&DWP(4, at T[1])); # $b
++ &add ($C,&DWP(8, at T[1]));
++ &mov (&DWP(0, at T[1]),$A);
++ &add ($D,&DWP(12, at T[1]));
++ &mov (&DWP(4, at T[1]), at T[0]);
++ &add ($E,&DWP(16, at T[1]));
++ &mov (&DWP(8, at T[1]),$C);
++ &mov ($B, at T[0]);
++ &mov (&DWP(12, at T[1]),$D);
++ &mov (&DWP(16, at T[1]),$E);
++ &movdqa (@X[0], at X[-3&7]);
++
++ &jmp (&label("loop"));
++
++&set_label("done",16); $j=$saved_j; @V=@saved_V;
++
++ &Xtail_ssse3(\&body_20_39);
++ &Xtail_ssse3(\&body_20_39);
++ &Xtail_ssse3(\&body_20_39);
++
++ &mov (@T[1],&DWP(192,"esp")); # update context
++ &add ($A,&DWP(0, at T[1]));
++ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
++ &add (@T[0],&DWP(4, at T[1])); # $b
++ &add ($C,&DWP(8, at T[1]));
++ &mov (&DWP(0, at T[1]),$A);
++ &add ($D,&DWP(12, at T[1]));
++ &mov (&DWP(4, at T[1]), at T[0]);
++ &add ($E,&DWP(16, at T[1]));
++ &mov (&DWP(8, at T[1]),$C);
++ &mov (&DWP(12, at T[1]),$D);
++ &mov (&DWP(16, at T[1]),$E);
++
++&function_end("_sha1_block_data_order_ssse3");
++
++if ($ymm) {
++my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
++my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
++my @V=($A,$B,$C,$D,$E);
++my $j=0; # hash round
++my @T=($T,$tmp1);
++my $inp;
++
++my $_rol=sub { &shld(@_[0], at _) };
++my $_ror=sub { &shrd(@_[0], at _) };
++
++&function_begin("_sha1_block_data_order_avx");
++ &call (&label("pic_point")); # make it PIC!
++ &set_label("pic_point");
++ &blindpop($tmp1);
++ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
++&set_label("avx_shortcut");
++ &vzeroall();
++
++ &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
++ &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
++ &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
++ &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
++ &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
++
++ &mov ($E,&wparam(0)); # load argument block
++ &mov ($inp=@T[1],&wparam(1));
++ &mov ($D,&wparam(2));
++ &mov (@T[0],"esp");
++
++ # stack frame layout
++ #
++ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
++ # X[4]+K X[5]+K X[6]+K X[7]+K
++ # X[8]+K X[9]+K X[10]+K X[11]+K
++ # X[12]+K X[13]+K X[14]+K X[15]+K
++ #
++ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
++ # X[4] X[5] X[6] X[7]
++ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
++ #
++ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
++ # K_40_59 K_40_59 K_40_59 K_40_59
++ # K_60_79 K_60_79 K_60_79 K_60_79
++ # K_00_19 K_00_19 K_00_19 K_00_19
++ # pbswap mask
++ #
++ # +192 ctx # argument block
++ # +196 inp
++ # +200 end
++ # +204 esp
++ &sub ("esp",208);
++ &and ("esp",-64);
++
++ &vmovdqa(&QWP(112+0,"esp"), at X[4]); # copy constants
++ &vmovdqa(&QWP(112+16,"esp"), at X[5]);
++ &vmovdqa(&QWP(112+32,"esp"), at X[6]);
++ &shl ($D,6); # len*64
++ &vmovdqa(&QWP(112+48,"esp"), at X[3]);
++ &add ($D,$inp); # end of input
++ &vmovdqa(&QWP(112+64,"esp"), at X[2]);
++ &add ($inp,64);
++ &mov (&DWP(192+0,"esp"),$E); # save argument block
++ &mov (&DWP(192+4,"esp"),$inp);
++ &mov (&DWP(192+8,"esp"),$D);
++ &mov (&DWP(192+12,"esp"), at T[0]); # save original %esp
++
++ &mov ($A,&DWP(0,$E)); # load context
++ &mov ($B,&DWP(4,$E));
++ &mov ($C,&DWP(8,$E));
++ &mov ($D,&DWP(12,$E));
++ &mov ($E,&DWP(16,$E));
++ &mov (@T[0],$B); # magic seed
++
++ &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
++ &vmovdqu(@X[-3&7],&QWP(-48,$inp));
++ &vmovdqu(@X[-2&7],&QWP(-32,$inp));
++ &vmovdqu(@X[-1&7],&QWP(-16,$inp));
++ &vpshufb(@X[-4&7], at X[-4&7], at X[2]); # byte swap
++ &vpshufb(@X[-3&7], at X[-3&7], at X[2]);
++ &vpshufb(@X[-2&7], at X[-2&7], at X[2]);
++ &vmovdqa(&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
++ &vpshufb(@X[-1&7], at X[-1&7], at X[2]);
++ &vpaddd (@X[0], at X[-4&7], at X[3]); # add K_00_19
++ &vpaddd (@X[1], at X[-3&7], at X[3]);
++ &vpaddd (@X[2], at X[-2&7], at X[3]);
++ &vmovdqa(&QWP(0,"esp"), at X[0]); # X[]+K xfer to IALU
++ &vmovdqa(&QWP(0+16,"esp"), at X[1]);
++ &vmovdqa(&QWP(0+32,"esp"), at X[2]);
++ &jmp (&label("loop"));
++
++sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
+{ use integer;
+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
++ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
-+ &vpalignr(@Tx[0], at X[-1&7], at X[-2&7],8); # compose "X[-6]"
-+ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
-+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
-+
-+ &vpxor (@X[0], at X[0], at X[-7&7]); # "X[0]"^="X[-28]"
++ &vpalignr(@X[0], at X[-3&7], at X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
-+ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
-+ if ($Xi%5) {
-+ &vmovdqa (@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
-+ } else { # ... or load next one
-+ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
-+ }
-+ &vpaddd (@Tx[1], at Tx[1], at X[-1&7]);
-+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
-+ &vpxor (@X[0], at X[0], at Tx[0]); # "X[0]"^="X[-6]"
-+ eval(shift(@insns)); # body_20_39
++ &vpaddd (@X[3], at X[3], at X[-1&7]);
++ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
-+
-+ &vpsrld (@Tx[0], at X[0],30);
-+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
++ &vpsrldq(@X[2], at X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ eval(shift(@insns)); # ror
++ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
-+
-+ &vpslld (@X[0], at X[0],2);
-+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
++
++ &vpxor (@X[2], at X[2], at X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
++ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
-+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
-+ &vpor (@X[0], at X[0], at Tx[0]); # "X[0]"<<<=2
-+ eval(shift(@insns)); # body_20_39
-+ eval(shift(@insns));
-+ &vmovdqa (@Tx[1], at X[0]) if ($Xi<19);
++ &vpxor (@X[0], at X[0], at X[2]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
-+ foreach (@insns) { eval; } # remaining instructions
-+
-+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
-+ push(@Tx,shift(@Tx));
-+}
-+
-+sub Xuplast_avx_80()
-+{ use integer;
-+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
-+ my ($a,$b,$c,$d,$e);
-+
-+ eval(shift(@insns));
-+ &vpaddd (@Tx[1], at Tx[1], at X[-1&7]);
++ &vpsrld (@X[2], at X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer IALU
-+
-+ foreach (@insns) { eval; } # remaining instructions
-+
-+ &cmp ($inp,$num);
-+ &je (".Ldone_avx");
-+
-+ unshift(@Tx,pop(@Tx));
-+
-+ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
-+ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
-+ &vmovdqu(@X[-4&7],"0($inp)"); # load input
-+ &vmovdqu(@X[-3&7],"16($inp)");
-+ &vmovdqu(@X[-2&7],"32($inp)");
-+ &vmovdqu(@X[-1&7],"48($inp)");
-+ &vpshufb(@X[-4&7], at X[-4&7], at X[2]); # byte swap
-+ &add ($inp,64);
-+
-+ $Xi=0;
-+}
++ &vpslldq(@X[4], at X[0],12); # "X[0]"<<96, extract one dword
++ &vpaddd (@X[0], at X[0], at X[0]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+sub Xloop_avx()
-+{ use integer;
-+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
-+ my ($a,$b,$c,$d,$e);
++ &vpsrld (@X[3], at X[4],30);
++ &vpor (@X[0], at X[0], at X[2]); # "X[0]"<<<=1
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
+
++ &vpslld (@X[4], at X[4],2);
++ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpshufb(@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
++ &vpxor (@X[0], at X[0], at X[3]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpaddd (@X[$Xi&7], at X[($Xi-4)&7], at Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
++
++ &vpxor (@X[0], at X[0], at X[4]); # "X[0]"^=("X[0]"<<96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vmovdqa(eval(16*$Xi)."(%rsp)", at X[$Xi&7]); # X[]+K xfer to IALU
++ &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ foreach (@insns) { eval; }
-+ $Xi++;
++ foreach (@insns) { eval; } # remaining instructions [if any]
++
++ $Xi++; push(@X,shift(@X)); # "rotate" X[]
+}
+
-+sub Xtail_avx()
++sub Xupdate_avx_32_79()
+{ use integer;
+ my $body = shift;
-+ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
-+ foreach (@insns) { eval; }
-+}
-+
-+$code.=<<___;
-+.align 16
-+.Loop_avx:
-+___
-+ &Xupdate_avx_16_31(\&body_00_19);
-+ &Xupdate_avx_16_31(\&body_00_19);
-+ &Xupdate_avx_16_31(\&body_00_19);
-+ &Xupdate_avx_16_31(\&body_00_19);
-+ &Xupdate_avx_32_79(\&body_00_19);
-+ &Xupdate_avx_32_79(\&body_20_39);
-+ &Xupdate_avx_32_79(\&body_20_39);
-+ &Xupdate_avx_32_79(\&body_20_39);
-+ &Xupdate_avx_32_79(\&body_20_39);
-+ &Xupdate_avx_32_79(\&body_20_39);
-+ &Xupdate_avx_32_79(\&body_40_59);
-+ &Xupdate_avx_32_79(\&body_40_59);
-+ &Xupdate_avx_32_79(\&body_40_59);
-+ &Xupdate_avx_32_79(\&body_40_59);
-+ &Xupdate_avx_32_79(\&body_40_59);
-+ &Xupdate_avx_32_79(\&body_20_39);
-+ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
-+
-+ $saved_j=$j; @saved_V=@V;
-+
-+ &Xloop_avx(\&body_20_39);
-+ &Xloop_avx(\&body_20_39);
-+ &Xloop_avx(\&body_20_39);
-+
-+$code.=<<___;
-+ add 0($ctx),$A # update context
-+ add 4($ctx), at T[0]
-+ add 8($ctx),$C
-+ add 12($ctx),$D
-+ mov $A,0($ctx)
-+ add 16($ctx),$E
-+ mov @T[0],4($ctx)
-+ mov @T[0],$B # magic seed
-+ mov $C,8($ctx)
-+ mov $D,12($ctx)
-+ mov $E,16($ctx)
-+ jmp .Loop_avx
-+
-+.align 16
-+.Ldone_avx:
-+___
-+ $j=$saved_j; @V=@saved_V;
++ &vpalignr(@X[2], at X[-1&7], at X[-2&7],8); # compose "X[-6]"
++ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
++ eval(shift(@insns)); # body_20_39
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # rol
+
-+ &Xtail_avx(\&body_20_39);
-+ &Xtail_avx(\&body_20_39);
-+ &Xtail_avx(\&body_20_39);
++ &vpxor (@X[0], at X[0], at X[-7&7]); # "X[0]"^="X[-28]"
++ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]); # save X[] to backtrace buffer
++ eval(shift(@insns));
++ eval(shift(@insns));
++ if ($Xi%5) {
++ &vmovdqa (@X[4], at X[3]); # "perpetuate" K_XX_XX...
++ } else { # ... or load next one
++ &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
++ }
++ &vpaddd (@X[3], at X[3], at X[-1&7]);
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
+
-+$code.=<<___;
-+ vzeroall
++ &vpxor (@X[0], at X[0], at X[2]); # "X[0]"^="X[-6]"
++ eval(shift(@insns)); # body_20_39
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # rol
+
-+ add 0($ctx),$A # update context
-+ add 4($ctx), at T[0]
-+ add 8($ctx),$C
-+ mov $A,0($ctx)
-+ add 12($ctx),$D
-+ mov @T[0],4($ctx)
-+ add 16($ctx),$E
-+ mov $C,8($ctx)
-+ mov $D,12($ctx)
-+ mov $E,16($ctx)
-+___
-+$code.=<<___ if ($win64);
-+ movaps 64+0(%rsp),%xmm6
-+ movaps 64+16(%rsp),%xmm7
-+ movaps 64+32(%rsp),%xmm8
-+ movaps 64+48(%rsp),%xmm9
-+ movaps 64+64(%rsp),%xmm10
-+___
-+$code.=<<___;
-+ lea `64+($win64?6*16:0)`(%rsp),%rsi
-+ mov 0(%rsi),%r12
-+ mov 8(%rsi),%rbp
-+ mov 16(%rsi),%rbx
-+ lea 24(%rsi),%rsp
-+.Lepilogue_avx:
-+ ret
-+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
-+___
-+}
-+$code.=<<___;
-+.align 64
-+K_XX_XX:
-+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
-+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
-+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
-+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
-+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
-+___
-+}}}
-+$code.=<<___;
-+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-+.align 64
- ___
-
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-@@ -272,25 +1109,73 @@ se_handler:
-
- lea .Lprologue(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
-- jb .Lin_prologue
-+ jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lepilogue(%rip),%r10
- cmp %r10,%rbx # context->Rip>=.Lepilogue
-- jae .Lin_prologue
-+ jae .Lcommon_seh_tail
-
- mov `16*4`(%rax),%rax # pull saved stack pointer
-- lea 24(%rax),%rax
-+ lea 32(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
-+ mov -32(%rax),%r13
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
-+ mov %r13,224($context) # restore context->R13
++ &vpsrld (@X[2], at X[0],30);
++ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
+
-+ jmp .Lcommon_seh_tail
-+.size se_handler,.-se_handler
++ &vpslld (@X[0], at X[0],2);
++ eval(shift(@insns)); # body_20_39
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # rol
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
+
-+.type ssse3_handler,\@abi-omnipotent
-+.align 16
-+ssse3_handler:
-+ push %rsi
-+ push %rdi
-+ push %rbx
-+ push %rbp
-+ push %r12
-+ push %r13
-+ push %r14
-+ push %r15
-+ pushfq
-+ sub \$64,%rsp
++ &vpor (@X[0], at X[0], at X[2]); # "X[0]"<<<=2
++ eval(shift(@insns)); # body_20_39
++ eval(shift(@insns));
++ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
++ eval(shift(@insns));
++ eval(shift(@insns)); # rol
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns)); # ror
++ eval(shift(@insns));
+
-+ mov 120($context),%rax # pull context->Rax
-+ mov 248($context),%rbx # pull context->Rip
++ foreach (@insns) { eval; } # remaining instructions
+
-+ mov 8($disp),%rsi # disp->ImageBase
-+ mov 56($disp),%r11 # disp->HandlerData
++ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++}
+
-+ mov 0(%r11),%r10d # HandlerData[0]
-+ lea (%rsi,%r10),%r10 # prologue label
-+ cmp %r10,%rbx # context->Rip<prologue label
-+ jb .Lcommon_seh_tail
++sub Xuplast_avx_80()
++{ use integer;
++ my $body = shift;
++ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my ($a,$b,$c,$d,$e);
+
-+ mov 152($context),%rax # pull context->Rsp
-
--.Lin_prologue:
-+ mov 4(%r11),%r10d # HandlerData[1]
-+ lea (%rsi,%r10),%r10 # epilogue label
-+ cmp %r10,%rbx # context->Rip>=epilogue label
-+ jae .Lcommon_seh_tail
++ eval(shift(@insns));
++ &vpaddd (@X[3], at X[3], at X[-1&7]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
+
-+ lea 64(%rax),%rsi
-+ lea 512($context),%rdi # &context.Xmm6
-+ mov \$10,%ecx
-+ .long 0xa548f3fc # cld; rep movsq
-+ lea 24+5*16(%rax),%rax # adjust stack pointer
++ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer IALU
+
-+ mov -8(%rax),%rbx
-+ mov -16(%rax),%rbp
-+ mov %rbx,144($context) # restore context->Rbx
-+ mov %rbp,160($context) # restore context->Rbp
++ foreach (@insns) { eval; } # remaining instructions
+
-+.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
-@@ -328,19 +1213,38 @@ se_handler:
- pop %rdi
- pop %rsi
- ret
--.size se_handler,.-se_handler
-+.size ssse3_handler,.-ssse3_handler
-
- .section .pdata
- .align 4
- .rva .LSEH_begin_sha1_block_data_order
- .rva .LSEH_end_sha1_block_data_order
- .rva .LSEH_info_sha1_block_data_order
--
-+ .rva .LSEH_begin_sha1_block_data_order_ssse3
-+ .rva .LSEH_end_sha1_block_data_order_ssse3
-+ .rva .LSEH_info_sha1_block_data_order_ssse3
-+___
-+$code.=<<___ if ($avx);
-+ .rva .LSEH_begin_sha1_block_data_order_avx
-+ .rva .LSEH_end_sha1_block_data_order_avx
-+ .rva .LSEH_info_sha1_block_data_order_avx
-+___
-+$code.=<<___;
- .section .xdata
- .align 8
- .LSEH_info_sha1_block_data_order:
- .byte 9,0,0,0
- .rva se_handler
-+.LSEH_info_sha1_block_data_order_ssse3:
-+ .byte 9,0,0,0
-+ .rva ssse3_handler
-+ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
-+___
-+$code.=<<___ if ($avx);
-+.LSEH_info_sha1_block_data_order_avx:
-+ .byte 9,0,0,0
-+ .rva ssse3_handler
-+ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
- ___
- }
-
-diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/crypto/sha/asm/sha1-586.pl
---- openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts 2008-07-17 11:50:56.000000000 +0200
-+++ openssl-1.0.0d/crypto/sha/asm/sha1-586.pl 2011-08-24 12:50:56.000000000 +0200
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
-
- # ====================================================================
- # [Re]written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
-@@ -12,6 +12,8 @@
- # commentary below], and in 2006 the rest was rewritten in order to
- # gain freedom to liberate licensing terms.
-
-+# January, September 2004.
-+#
- # It was noted that Intel IA-32 C compiler generates code which
- # performs ~30% *faster* on P4 CPU than original *hand-coded*
- # SHA1 assembler implementation. To address this problem (and
-@@ -31,12 +33,92 @@
- # ----------------------------------------------------------------
- # <appro at fy.chalmers.se>
-
-+# August 2009.
-+#
-+# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
-+# '(c&d) + (b&(c^d))', which allows to accumulate partial results
-+# and lighten "pressure" on scratch registers. This resulted in
-+# >12% performance improvement on contemporary AMD cores (with no
-+# degradation on other CPUs:-). Also, the code was revised to maximize
-+# "distance" between instructions producing input to 'lea' instruction
-+# and the 'lea' instruction itself, which is essential for Intel Atom
-+# core and resulted in ~15% improvement.
++ &mov ($inp=@T[1],&DWP(192+4,"esp"));
++ &cmp ($inp,&DWP(192+8,"esp"));
++ &je (&label("done"));
+
-+# October 2010.
-+#
-+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
-+# is to offload message schedule denoted by Wt in NIST specification,
-+# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
-+# and in SSE2 context was first explored by Dean Gaudet in 2004, see
-+# http://arctic.org/~dean/crypto/sha1.html. Since then several things
-+# have changed that made it interesting again:
-+#
-+# a) XMM units became faster and wider;
-+# b) instruction set became more versatile;
-+# c) an important observation was made by Max Locktykhin, which made
-+# it possible to reduce amount of instructions required to perform
-+# the operation in question, for further details see
-+# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
++ &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
++ &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
++ &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
++ &vmovdqu(@X[-3&7],&QWP(16,$inp));
++ &vmovdqu(@X[-2&7],&QWP(32,$inp));
++ &vmovdqu(@X[-1&7],&QWP(48,$inp));
++ &add ($inp,64);
++ &vpshufb(@X[-4&7], at X[-4&7], at X[2]); # byte swap
++ &mov (&DWP(192+4,"esp"),$inp);
++ &vmovdqa(&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
+
-+# April 2011.
-+#
-+# Add AVX code path, probably most controversial... The thing is that
-+# switch to AVX alone improves performance by as little as 4% in
-+# comparison to SSSE3 code path. But below result doesn't look like
-+# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
-+# pair of µ-ops, and it's the additional µ-ops, two per round, that
-+# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
-+# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with
-+# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
-+# cycles per processed byte. But 'sh[rl]d' is not something that used
-+# to be fast, nor does it appear to be fast in upcoming Bulldozer
-+# [according to its optimization manual]. Which is why AVX code path
-+# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
-+# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
-+# makes no sense to keep the AVX code path. If somebody feels that
-+# strongly, it's probably more appropriate to discuss possibility of
-+# using vector rotate XOP on AMD...
++ $Xi=0;
++}
+
-+######################################################################
-+# Current performance is summarized in following table. Numbers are
-+# CPU clock cycles spent to process single byte (less is better).
-+#
-+# x86 SSSE3 AVX
-+# Pentium 15.7 -
-+# PIII 11.5 -
-+# P4 10.6 -
-+# AMD K8 7.1 -
-+# Core2 7.3 6.1/+20% -
-+# Atom 12.5 9.5(*)/+32% -
-+# Westmere 7.3 5.6/+30% -
-+# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
-+#
-+# (*) Loop is 1056 instructions long and expected result is ~8.25.
-+# It remains mystery [to me] why ILP is limited to 1.7.
-+#
-+# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
++sub Xloop_avx()
++{ use integer;
++ my $body = shift;
++ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my ($a,$b,$c,$d,$e);
++
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &vpshufb (@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &vpaddd (@X[$Xi&7], at X[($Xi-4)&7], at X[3]);
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ eval(shift(@insns));
++ &vmovdqa (&QWP(0+16*$Xi,"esp"), at X[$Xi&7]); # X[]+K xfer to IALU
++ eval(shift(@insns));
++ eval(shift(@insns));
++
++ foreach (@insns) { eval; }
++ $Xi++;
++}
++
++sub Xtail_avx()
++{ use integer;
++ my $body = shift;
++ my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
++ my ($a,$b,$c,$d,$e);
++
++ foreach (@insns) { eval; }
++}
++
++&set_label("loop",16);
++ &Xupdate_avx_16_31(\&body_00_19);
++ &Xupdate_avx_16_31(\&body_00_19);
++ &Xupdate_avx_16_31(\&body_00_19);
++ &Xupdate_avx_16_31(\&body_00_19);
++ &Xupdate_avx_32_79(\&body_00_19);
++ &Xupdate_avx_32_79(\&body_20_39);
++ &Xupdate_avx_32_79(\&body_20_39);
++ &Xupdate_avx_32_79(\&body_20_39);
++ &Xupdate_avx_32_79(\&body_20_39);
++ &Xupdate_avx_32_79(\&body_20_39);
++ &Xupdate_avx_32_79(\&body_40_59);
++ &Xupdate_avx_32_79(\&body_40_59);
++ &Xupdate_avx_32_79(\&body_40_59);
++ &Xupdate_avx_32_79(\&body_40_59);
++ &Xupdate_avx_32_79(\&body_40_59);
++ &Xupdate_avx_32_79(\&body_20_39);
++ &Xuplast_avx_80(\&body_20_39); # can jump to "done"
+
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- push(@INC,"${dir}","${dir}../../perlasm");
- require "x86asm.pl";
-
- &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
-
-+$xmm=1; $ymm=0;
-+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
++ $saved_j=$j; @saved_V=@V;
+
-+$ymm=1 if ($xmm &&
-+ `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-+ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
-+ $1>=2.19); # first version supporting AVX
++ &Xloop_avx(\&body_20_39);
++ &Xloop_avx(\&body_20_39);
++ &Xloop_avx(\&body_20_39);
+
-+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
-+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
-+ $1>=2.03); # first version supporting AVX
++ &mov (@T[1],&DWP(192,"esp")); # update context
++ &add ($A,&DWP(0, at T[1]));
++ &add (@T[0],&DWP(4, at T[1])); # $b
++ &add ($C,&DWP(8, at T[1]));
++ &mov (&DWP(0, at T[1]),$A);
++ &add ($D,&DWP(12, at T[1]));
++ &mov (&DWP(4, at T[1]), at T[0]);
++ &add ($E,&DWP(16, at T[1]));
++ &mov (&DWP(8, at T[1]),$C);
++ &mov ($B, at T[0]);
++ &mov (&DWP(12, at T[1]),$D);
++ &mov (&DWP(16, at T[1]),$E);
+
-+&external_label("OPENSSL_ia32cap_X") if ($xmm);
++ &jmp (&label("loop"));
+
++&set_label("done",16); $j=$saved_j; @V=@saved_V;
+
- $A="eax";
- $B="ebx";
- $C="ecx";
-@@ -47,6 +129,10 @@ $tmp1="ebp";
-
- @V=($A,$B,$C,$D,$E,$T);
-
-+$alt=0; # 1 denotes alternative IALU implementation, which performs
-+ # 8% *worse* on P4, same on Westmere and Atom, 2% better on
-+ # Sandy Bridge...
++ &Xtail_avx(\&body_20_39);
++ &Xtail_avx(\&body_20_39);
++ &Xtail_avx(\&body_20_39);
+
- sub BODY_00_15
- {
- local($n,$a,$b,$c,$d,$e,$f)=@_;
-@@ -59,16 +145,18 @@ sub BODY_00_15
- &rotl($tmp1,5); # tmp1=ROTATE(a,5)
- &xor($f,$d);
- &add($tmp1,$e); # tmp1+=e;
-- &and($f,$b);
-- &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
-+ &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded
- # with xi, also note that e becomes
- # f in next round...
-- &xor($f,$d); # f holds F_00_19(b,c,d)
-+ &and($f,$b);
- &rotr($b,2); # b=ROTATE(b,30)
-- &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
-+ &xor($f,$d); # f holds F_00_19(b,c,d)
-+ &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
-
-- if ($n==15) { &add($f,$tmp1); } # f+=tmp1
-+ if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
-+ &add($f,$tmp1); } # f+=tmp1
- else { &add($tmp1,$f); } # f becomes a in next round
-+ &mov($tmp1,$a) if ($alt && $n==15);
- }
-
- sub BODY_16_19
-@@ -77,22 +165,41 @@ sub BODY_16_19
-
- &comment("16_19 $n");
-
-- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-- &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
-- &xor($f,&swtmp(($n+2)%16));
-- &xor($tmp1,$d);
-- &xor($f,&swtmp(($n+8)%16));
-- &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d)
-- &rotr($b,2); # b=ROTATE(b,30)
-+if ($alt) {
-+ &xor($c,$d);
-+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-+ &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d
-+ &xor($f,&swtmp(($n+8)%16));
-+ &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
-+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
-+ &rotl($f,1); # f=ROTATE(f,1)
-+ &add($e,$tmp1); # e+=F_00_19(b,c,d)
-+ &xor($c,$d); # restore $c
-+ &mov($tmp1,$a); # b in next round
-+ &rotr($b,$n==16?2:7); # b=ROTATE(b,30)
-+ &mov(&swtmp($n%16),$f); # xi=f
-+ &rotl($a,5); # ROTATE(a,5)
-+ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
-+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
-+ &add($f,$a); # f+=ROTATE(a,5)
-+} else {
-+ &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d)
-+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-+ &xor($tmp1,$d);
-+ &xor($f,&swtmp(($n+8)%16));
-+ &and($tmp1,$b);
- &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
- &rotl($f,1); # f=ROTATE(f,1)
- &xor($tmp1,$d); # tmp1=F_00_19(b,c,d)
-- &mov(&swtmp($n%16),$f); # xi=f
-- &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
-- &mov($e,$a); # e becomes volatile
-- &rotl($e,5); # e=ROTATE(a,5)
-- &add($f,$tmp1); # f+=F_00_19(b,c,d)
-- &add($f,$e); # f+=ROTATE(a,5)
-+ &add($e,$tmp1); # e+=F_00_19(b,c,d)
-+ &mov($tmp1,$a);
-+ &rotr($b,2); # b=ROTATE(b,30)
-+ &mov(&swtmp($n%16),$f); # xi=f
-+ &rotl($tmp1,5); # ROTATE(a,5)
-+ &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
-+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
-+ &add($f,$tmp1); # f+=ROTATE(a,5)
-+}
- }
-
- sub BODY_20_39
-@@ -102,21 +209,41 @@ sub BODY_20_39
-
- &comment("20_39 $n");
-
-+if ($alt) {
-+ &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c
-+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-+ &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
-+ &xor($f,&swtmp(($n+8)%16));
-+ &add($e,$tmp1); # e+=F_20_39(b,c,d)
-+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
-+ &rotl($f,1); # f=ROTATE(f,1)
-+ &mov($tmp1,$a); # b in next round
-+ &rotr($b,7); # b=ROTATE(b,30)
-+ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
-+ &rotl($a,5); # ROTATE(a,5)
-+ &xor($b,$c) if($n==39);# warm up for BODY_40_59
-+ &and($tmp1,$b) if($n==39);
-+ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
-+ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
-+ &add($f,$a); # f+=ROTATE(a,5)
-+ &rotr($a,5) if ($n==79);
-+} else {
- &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d)
-- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-- &rotr($b,2); # b=ROTATE(b,30)
-- &xor($f,&swtmp(($n+2)%16));
-+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
- &xor($tmp1,$c);
- &xor($f,&swtmp(($n+8)%16));
- &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d)
- &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
- &rotl($f,1); # f=ROTATE(f,1)
-- &add($tmp1,$e);
-- &mov(&swtmp($n%16),$f); # xi=f
-- &mov($e,$a); # e becomes volatile
-- &rotl($e,5); # e=ROTATE(a,5)
-- &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e
-- &add($f,$e); # f+=ROTATE(a,5)
-+ &add($e,$tmp1); # e+=F_20_39(b,c,d)
-+ &rotr($b,2); # b=ROTATE(b,30)
-+ &mov($tmp1,$a);
-+ &rotl($tmp1,5); # ROTATE(a,5)
-+ &mov(&swtmp($n%16),$f) if($n<77);# xi=f
-+ &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY
-+ &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
-+ &add($f,$tmp1); # f+=ROTATE(a,5)
-+}
- }
-
- sub BODY_40_59
-@@ -125,41 +252,86 @@ sub BODY_40_59
-
- &comment("40_59 $n");
-
-- &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-- &mov($tmp1,&swtmp(($n+2)%16));
-- &xor($f,$tmp1);
-- &mov($tmp1,&swtmp(($n+8)%16));
-- &xor($f,$tmp1);
-- &mov($tmp1,&swtmp(($n+13)%16));
-- &xor($f,$tmp1); # f holds xa^xb^xc^xd
-- &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d)
-+if ($alt) {
-+ &add($e,$tmp1); # e+=b&(c^d)
-+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-+ &mov($tmp1,$d);
-+ &xor($f,&swtmp(($n+8)%16));
-+ &xor($c,$d); # restore $c
-+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
- &rotl($f,1); # f=ROTATE(f,1)
-- &or($tmp1,$c);
-- &mov(&swtmp($n%16),$f); # xi=f
-- &and($tmp1,$d);
-- &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
-- &mov($e,$b); # e becomes volatile and is used
-- # to calculate F_40_59(b,c,d)
-+ &and($tmp1,$c);
-+ &rotr($b,7); # b=ROTATE(b,30)
-+ &add($e,$tmp1); # e+=c&d
-+ &mov($tmp1,$a); # b in next round
-+ &mov(&swtmp($n%16),$f); # xi=f
-+ &rotl($a,5); # ROTATE(a,5)
-+ &xor($b,$c) if ($n<59);
-+ &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d)
-+ &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
-+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
-+ &add($f,$a); # f+=ROTATE(a,5)
-+} else {
-+ &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d)
-+ &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd)
-+ &xor($tmp1,$d);
-+ &xor($f,&swtmp(($n+8)%16));
-+ &and($tmp1,$b);
-+ &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd
-+ &rotl($f,1); # f=ROTATE(f,1)
-+ &add($tmp1,$e); # b&(c^d)+=e
- &rotr($b,2); # b=ROTATE(b,30)
-- &and($e,$c);
-- &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d)
-- &mov($e,$a);
-- &rotl($e,5); # e=ROTATE(a,5)
-- &add($f,$tmp1); # f+=tmp1;
-+ &mov($e,$a); # e becomes volatile
-+ &rotl($e,5); # ROTATE(a,5)
-+ &mov(&swtmp($n%16),$f); # xi=f
-+ &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
-+ &mov($tmp1,$c);
- &add($f,$e); # f+=ROTATE(a,5)
-+ &and($tmp1,$d);
-+ &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round
-+ &add($f,$tmp1); # f+=c&d
++ &vzeroall();
++
++ &mov (@T[1],&DWP(192,"esp")); # update context
++ &add ($A,&DWP(0, at T[1]));
++ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
++ &add (@T[0],&DWP(4, at T[1])); # $b
++ &add ($C,&DWP(8, at T[1]));
++ &mov (&DWP(0, at T[1]),$A);
++ &add ($D,&DWP(12, at T[1]));
++ &mov (&DWP(4, at T[1]), at T[0]);
++ &add ($E,&DWP(16, at T[1]));
++ &mov (&DWP(8, at T[1]),$C);
++ &mov (&DWP(12, at T[1]),$D);
++ &mov (&DWP(16, at T[1]),$E);
++&function_end("_sha1_block_data_order_avx");
+}
- }
++&set_label("K_XX_XX",64);
++&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
++&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
++&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
++&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
++&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
++}
+ &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
- &function_begin("sha1_block_data_order");
-+if ($xmm) {
-+ &static_label("ssse3_shortcut");
-+ &static_label("avx_shortcut") if ($ymm);
-+ &static_label("K_XX_XX");
+ &asm_finish();
+diff -up openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl.intelopts openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl
+--- openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/sha/asm/sha1-x86_64.pl 2013-02-19 21:19:43.923583195 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro at fy.chalmers.se> for the OpenSSL
+@@ -16,7 +16,7 @@
+ # There was suggestion to mechanically translate 32-bit code, but I
+ # dismissed it, reasoning that x86_64 offers enough register bank
+ # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+-# implementation:-) However! While 64-bit code does performs better
++# implementation:-) However! While 64-bit code does perform better
+ # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
+ # x86_64 does offer larger *addressable* bank, but out-of-order core
+ # reaches for even more registers through dynamic aliasing, and EM64T
+@@ -29,6 +29,38 @@
+ # Xeon P4 +65% +0% 9.9
+ # Core2 +60% +10% 7.0
+
++# August 2009.
++#
++# The code was revised to minimize code size and to maximize
++# "distance" between instructions producing input to 'lea'
++# instruction and the 'lea' instruction itself, which is essential
++# for Intel Atom core.
+
-+ &call (&label("pic_point")); # make it PIC!
-+ &set_label("pic_point");
-+ &blindpop($tmp1);
-+ &picmeup($T,"OPENSSL_ia32cap_X",$tmp1,&label("pic_point"));
-+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
++# October 2010.
++#
++# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
++# is to offload message schedule denoted by Wt in NIST specification,
++# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
++# for background and implementation details. The only difference from
++# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
++# to free temporary registers.
+
-+ &mov ($A,&DWP(0,$T));
-+ &mov ($D,&DWP(4,$T));
-+ &test ($D,1<<9); # check SSSE3 bit
-+ &jz (&label("x86"));
-+ &test ($A,1<<24); # check FXSR bit
-+ &jz (&label("x86"));
-+ if ($ymm) {
-+ &and ($D,1<<28); # mask AVX bit
-+ &and ($A,1<<30); # mask "Intel CPU" bit
-+ &or ($A,$D);
-+ &cmp ($A,1<<28|1<<30);
-+ &je (&label("avx_shortcut"));
-+ }
-+ &jmp (&label("ssse3_shortcut"));
-+ &set_label("x86",16);
-+}
- &mov($tmp1,&wparam(0)); # SHA_CTX *c
- &mov($T,&wparam(1)); # const void *input
- &mov($A,&wparam(2)); # size_t num
-- &stack_push(16); # allocate X[16]
-+ &stack_push(16+3); # allocate X[16]
- &shl($A,6);
- &add($A,$T);
- &mov(&wparam(2),$A); # pointer beyond the end of input
- &mov($E,&DWP(16,$tmp1));# pre-load E
-+ &jmp(&label("loop"));
++# April 2011.
++#
++# Add AVX code path. See sha1-586.pl for further information.
++
++######################################################################
++# Current performance is summarized in following table. Numbers are
++# CPU clock cycles spent to process single byte (less is better).
++#
++# x86_64 SSSE3 AVX
++# P4 9.8 -
++# Opteron 6.6 -
++# Core2 6.7 6.1/+10% -
++# Atom 11.0 9.7/+13% -
++# Westmere 7.1 5.6/+27% -
++# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
++
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+@@ -40,6 +72,13 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
++$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
++ =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
++ $1>=2.19);
++$avx=1 if (!$avx && $flavour =~ /nasm/ &&
++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
++ $1>=2.03);
++
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+
+ $ctx="%rdi"; # 1st arg
+@@ -51,196 +90,994 @@ $ctx="%r8";
+ $inp="%r9";
+ $num="%r10";
+
+-$xi="%eax";
+-$t0="%ebx";
+-$t1="%ecx";
+-$A="%edx";
+-$B="%esi";
+-$C="%edi";
+-$D="%ebp";
+-$E="%r11d";
+-$T="%r12d";
+-
+- at V=($A,$B,$C,$D,$E,$T);
++$t0="%eax";
++$t1="%ebx";
++$t2="%ecx";
++ at xi=("%edx","%ebp");
++$A="%esi";
++$B="%edi";
++$C="%r11d";
++$D="%r12d";
++$E="%r13d";
+
+-sub PROLOGUE {
+-my $func=shift;
+-$code.=<<___;
+-.globl $func
+-.type $func,\@function,3
+-.align 16
+-$func:
+- push %rbx
+- push %rbp
+- push %r12
+- mov %rsp,%r11
+- mov %rdi,$ctx # reassigned argument
+- sub \$`8+16*4`,%rsp
+- mov %rsi,$inp # reassigned argument
+- and \$-64,%rsp
+- mov %rdx,$num # reassigned argument
+- mov %r11,`16*4`(%rsp)
+-.Lprologue:
+-
+- mov 0($ctx),$A
+- mov 4($ctx),$B
+- mov 8($ctx),$C
+- mov 12($ctx),$D
+- mov 16($ctx),$E
+-___
+-}
+-
+-sub EPILOGUE {
+-my $func=shift;
+-$code.=<<___;
+- mov `16*4`(%rsp),%rsi
+- mov (%rsi),%r12
+- mov 8(%rsi),%rbp
+- mov 16(%rsi),%rbx
+- lea 24(%rsi),%rsp
+-.Lepilogue:
+- ret
+-.size $func,.-$func
+-___
+-}
++ at V=($A,$B,$C,$D,$E);
+
+ sub BODY_00_19 {
+-my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
++my ($i,$a,$b,$c,$d,$e)=@_;
+ my $j=$i+1;
+ $code.=<<___ if ($i==0);
+- mov `4*$i`($inp),$xi
+- `"bswap $xi" if(!defined($host))`
+- mov $xi,`4*$i`(%rsp)
++ mov `4*$i`($inp),$xi[0]
++ bswap $xi[0]
++ mov $xi[0],`4*$i`(%rsp)
+ ___
+ $code.=<<___ if ($i<15);
+- lea 0x5a827999($xi,$e),$f
+ mov $c,$t0
+- mov `4*$j`($inp),$xi
+- mov $a,$e
++ mov `4*$j`($inp),$xi[1]
++ mov $a,$t2
+ xor $d,$t0
+- `"bswap $xi" if(!defined($host))`
+- rol \$5,$e
++ bswap $xi[1]
++ rol \$5,$t2
++ lea 0x5a827999($xi[0],$e),$e
+ and $b,$t0
+- mov $xi,`4*$j`(%rsp)
+- add $e,$f
++ mov $xi[1],`4*$j`(%rsp)
++ add $t2,$e
+ xor $d,$t0
+ rol \$30,$b
+- add $t0,$f
++ add $t0,$e
+ ___
+ $code.=<<___ if ($i>=15);
+- lea 0x5a827999($xi,$e),$f
+- mov `4*($j%16)`(%rsp),$xi
++ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
+- mov $a,$e
+- xor `4*(($j+2)%16)`(%rsp),$xi
++ mov $a,$t2
++ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ xor $d,$t0
+- rol \$5,$e
+- xor `4*(($j+8)%16)`(%rsp),$xi
++ rol \$5,$t2
++ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ and $b,$t0
+- add $e,$f
+- xor `4*(($j+13)%16)`(%rsp),$xi
++ lea 0x5a827999($xi[0],$e),$e
++ xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ xor $d,$t0
++ rol \$1,$xi[1]
++ add $t2,$e
+ rol \$30,$b
+- add $t0,$f
+- rol \$1,$xi
+- mov $xi,`4*($j%16)`(%rsp)
++ mov $xi[1],`4*($j%16)`(%rsp)
++ add $t0,$e
+ ___
++unshift(@xi,pop(@xi));
+ }
-- &set_label("loop",16);
-+&set_label("loop",16);
+ sub BODY_20_39 {
+-my ($i,$a,$b,$c,$d,$e,$f)=@_;
++my ($i,$a,$b,$c,$d,$e)=@_;
+ my $j=$i+1;
+ my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
+ $code.=<<___ if ($i<79);
+- lea $K($xi,$e),$f
+- mov `4*($j%16)`(%rsp),$xi
++ mov `4*($j%16)`(%rsp),$xi[1]
+ mov $c,$t0
+- mov $a,$e
+- xor `4*(($j+2)%16)`(%rsp),$xi
++ mov $a,$t2
++ xor `4*(($j+2)%16)`(%rsp),$xi[1]
+ xor $b,$t0
+- rol \$5,$e
+- xor `4*(($j+8)%16)`(%rsp),$xi
++ rol \$5,$t2
++ lea $K($xi[0],$e),$e
++ xor `4*(($j+8)%16)`(%rsp),$xi[1]
+ xor $d,$t0
+- add $e,$f
+- xor `4*(($j+13)%16)`(%rsp),$xi
++ add $t2,$e
++ xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ rol \$30,$b
+- add $t0,$f
+- rol \$1,$xi
++ add $t0,$e
++ rol \$1,$xi[1]
+ ___
+ $code.=<<___ if ($i<76);
+- mov $xi,`4*($j%16)`(%rsp)
++ mov $xi[1],`4*($j%16)`(%rsp)
+ ___
+ $code.=<<___ if ($i==79);
+- lea $K($xi,$e),$f
+ mov $c,$t0
+- mov $a,$e
++ mov $a,$t2
+ xor $b,$t0
+- rol \$5,$e
++ lea $K($xi[0],$e),$e
++ rol \$5,$t2
+ xor $d,$t0
+- add $e,$f
++ add $t2,$e
+ rol \$30,$b
+- add $t0,$f
++ add $t0,$e
+ ___
++unshift(@xi,pop(@xi));
+ }
- # copy input chunk to X, but reversing byte order!
- for ($i=0; $i<16; $i+=4)
-@@ -213,8 +385,845 @@ sub BODY_40_59
- &mov(&DWP(16,$tmp1),$C);
- &jb(&label("loop"));
+ sub BODY_40_59 {
+-my ($i,$a,$b,$c,$d,$e,$f)=@_;
++my ($i,$a,$b,$c,$d,$e)=@_;
+ my $j=$i+1;
+ $code.=<<___;
+- lea 0x8f1bbcdc($xi,$e),$f
+- mov `4*($j%16)`(%rsp),$xi
+- mov $b,$t0
+- mov $b,$t1
+- xor `4*(($j+2)%16)`(%rsp),$xi
+- mov $a,$e
+- and $c,$t0
+- xor `4*(($j+8)%16)`(%rsp),$xi
+- or $c,$t1
+- rol \$5,$e
+- xor `4*(($j+13)%16)`(%rsp),$xi
+- and $d,$t1
+- add $e,$f
+- rol \$1,$xi
+- or $t1,$t0
++ mov `4*($j%16)`(%rsp),$xi[1]
++ mov $c,$t0
++ mov $c,$t1
++ xor `4*(($j+2)%16)`(%rsp),$xi[1]
++ and $d,$t0
++ mov $a,$t2
++ xor `4*(($j+8)%16)`(%rsp),$xi[1]
++ xor $d,$t1
++ lea 0x8f1bbcdc($xi[0],$e),$e
++ rol \$5,$t2
++ xor `4*(($j+13)%16)`(%rsp),$xi[1]
++ add $t0,$e
++ and $b,$t1
++ rol \$1,$xi[1]
++ add $t1,$e
+ rol \$30,$b
+- mov $xi,`4*($j%16)`(%rsp)
+- add $t0,$f
++ mov $xi[1],`4*($j%16)`(%rsp)
++ add $t2,$e
+ ___
++unshift(@xi,pop(@xi));
+ }
-- &stack_pop(16);
-+ &stack_pop(16+3);
- &function_end("sha1_block_data_order");
+-$code=".text\n";
++$code.=<<___;
++.text
++.extern OPENSSL_ia32cap_X
+
-+if ($xmm) {
-+######################################################################
-+# The SSSE3 implementation.
-+#
-+# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
-+# 32 elements of the message schedule or Xupdate outputs. First 4
-+# quadruples are simply byte-swapped input, next 4 are calculated
-+# according to method originally suggested by Dean Gaudet (modulo
-+# being implemented in SSSE3). Once 8 quadruples or 32 elements are
-+# collected, it switches to routine proposed by Max Locktyukhin.
-+#
-+# Calculations inevitably require temporary reqisters, and there are
-+# no %xmm registers left to spare. For this reason part of the ring
-+# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring
-+# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
-+# X[-5], and X[4] - X[-4]...
-+#
-+# Another notable optimization is aggressive stack frame compression
-+# aiming to minimize amount of 9-byte instructions...
-+#
-+# Yet another notable optimization is "jumping" $B variable. It means
-+# that there is no register permanently allocated for $B value. This
-+# allowed to eliminate one instruction from body_20_39...
-+#
-+my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
-+my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
-+my @V=($A,$B,$C,$D,$E);
-+my $j=0; # hash round
-+my @T=($T,$tmp1);
-+my $inp;
++.globl sha1_block_data_order
++.type sha1_block_data_order,\@function,3
++.align 16
++sha1_block_data_order:
++ mov OPENSSL_ia32cap_X+0(%rip),%r9d
++ mov OPENSSL_ia32cap_X+4(%rip),%r8d
++ test \$`1<<9`,%r8d # check SSSE3 bit
++ jz .Lialu
++___
++$code.=<<___ if ($avx);
++ and \$`1<<28`,%r8d # mask AVX bit
++ and \$`1<<30`,%r9d # mask "Intel CPU" bit
++ or %r9d,%r8d
++ cmp \$`1<<28|1<<30`,%r8d
++ je _avx_shortcut
++___
++$code.=<<___;
++ jmp _ssse3_shortcut
+
-+my $_rol=sub { &rol(@_) };
-+my $_ror=sub { &ror(@_) };
++.align 16
++.Lialu:
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ mov %rsp,%r11
++ mov %rdi,$ctx # reassigned argument
++ sub \$`8+16*4`,%rsp
++ mov %rsi,$inp # reassigned argument
++ and \$-64,%rsp
++ mov %rdx,$num # reassigned argument
++ mov %r11,`16*4`(%rsp)
++.Lprologue:
+
-+&function_begin("_sha1_block_data_order_ssse3");
-+ &call (&label("pic_point")); # make it PIC!
-+ &set_label("pic_point");
-+ &blindpop($tmp1);
-+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-+&set_label("ssse3_shortcut");
++ mov 0($ctx),$A
++ mov 4($ctx),$B
++ mov 8($ctx),$C
++ mov 12($ctx),$D
++ mov 16($ctx),$E
++ jmp .Lloop
+
+-&PROLOGUE("sha1_block_data_order");
+-$code.=".align 4\n.Lloop:\n";
++.align 16
++.Lloop:
++___
+ for($i=0;$i<20;$i++) { &BODY_00_19($i, at V); unshift(@V,pop(@V)); }
+ for(;$i<40;$i++) { &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+ for(;$i<60;$i++) { &BODY_40_59($i, at V); unshift(@V,pop(@V)); }
+ for(;$i<80;$i++) { &BODY_20_39($i, at V); unshift(@V,pop(@V)); }
+ $code.=<<___;
+- add 0($ctx),$E
+- add 4($ctx),$T
+- add 8($ctx),$A
+- add 12($ctx),$B
+- add 16($ctx),$C
+- mov $E,0($ctx)
+- mov $T,4($ctx)
+- mov $A,8($ctx)
+- mov $B,12($ctx)
+- mov $C,16($ctx)
+-
+- xchg $E,$A # mov $E,$A
+- xchg $T,$B # mov $T,$B
+- xchg $E,$C # mov $A,$C
+- xchg $T,$D # mov $B,$D
+- # mov $C,$E
+- lea `16*4`($inp),$inp
++ add 0($ctx),$A
++ add 4($ctx),$B
++ add 8($ctx),$C
++ add 12($ctx),$D
++ add 16($ctx),$E
++ mov $A,0($ctx)
++ mov $B,4($ctx)
++ mov $C,8($ctx)
++ mov $D,12($ctx)
++ mov $E,16($ctx)
+
-+ &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19
-+ &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39
-+ &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59
-+ &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79
-+ &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask
+ sub \$1,$num
++ lea `16*4`($inp),$inp
+ jnz .Lloop
++
++ mov `16*4`(%rsp),%rsi
++ mov (%rsi),%r13
++ mov 8(%rsi),%r12
++ mov 16(%rsi),%rbp
++ mov 24(%rsi),%rbx
++ lea 32(%rsi),%rsp
++.Lepilogue:
++ ret
++.size sha1_block_data_order,.-sha1_block_data_order
+ ___
+-&EPILOGUE("sha1_block_data_order");
++{{{
++my $Xi=4;
++my @X=map("%xmm$_",(4..7,0..3));
++my @Tx=map("%xmm$_",(8..10));
++my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
++my @T=("%esi","%edi");
++my $j=0;
++my $K_XX_XX="%r11";
++
++my $_rol=sub { &rol(@_) };
++my $_ror=sub { &ror(@_) };
+
-+ &mov ($E,&wparam(0)); # load argument block
-+ &mov ($inp=@T[1],&wparam(1));
-+ &mov ($D,&wparam(2));
-+ &mov (@T[0],"esp");
+ $code.=<<___;
+-.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.type sha1_block_data_order_ssse3,\@function,3
+ .align 16
++sha1_block_data_order_ssse3:
++_ssse3_shortcut:
++ push %rbx
++ push %rbp
++ push %r12
++ lea `-64-($win64?5*16:0)`(%rsp),%rsp
++___
++$code.=<<___ if ($win64);
++ movaps %xmm6,64+0(%rsp)
++ movaps %xmm7,64+16(%rsp)
++ movaps %xmm8,64+32(%rsp)
++ movaps %xmm9,64+48(%rsp)
++ movaps %xmm10,64+64(%rsp)
++.Lprologue_ssse3:
++___
++$code.=<<___;
++ mov %rdi,$ctx # reassigned argument
++ mov %rsi,$inp # reassigned argument
++ mov %rdx,$num # reassigned argument
+
-+ # stack frame layout
-+ #
-+ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
-+ # X[4]+K X[5]+K X[6]+K X[7]+K
-+ # X[8]+K X[9]+K X[10]+K X[11]+K
-+ # X[12]+K X[13]+K X[14]+K X[15]+K
-+ #
-+ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
-+ # X[4] X[5] X[6] X[7]
-+ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
-+ #
-+ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
-+ # K_40_59 K_40_59 K_40_59 K_40_59
-+ # K_60_79 K_60_79 K_60_79 K_60_79
-+ # K_00_19 K_00_19 K_00_19 K_00_19
-+ # pbswap mask
-+ #
-+ # +192 ctx # argument block
-+ # +196 inp
-+ # +200 end
-+ # +204 esp
-+ &sub ("esp",208);
-+ &and ("esp",-64);
++ shl \$6,$num
++ add $inp,$num
++ lea K_XX_XX(%rip),$K_XX_XX
+
-+ &movdqa (&QWP(112+0,"esp"), at X[4]); # copy constants
-+ &movdqa (&QWP(112+16,"esp"), at X[5]);
-+ &movdqa (&QWP(112+32,"esp"), at X[6]);
-+ &shl ($D,6); # len*64
-+ &movdqa (&QWP(112+48,"esp"), at X[3]);
-+ &add ($D,$inp); # end of input
-+ &movdqa (&QWP(112+64,"esp"), at X[2]);
-+ &add ($inp,64);
-+ &mov (&DWP(192+0,"esp"),$E); # save argument block
-+ &mov (&DWP(192+4,"esp"),$inp);
-+ &mov (&DWP(192+8,"esp"),$D);
-+ &mov (&DWP(192+12,"esp"), at T[0]); # save original %esp
++ mov 0($ctx),$A # load context
++ mov 4($ctx),$B
++ mov 8($ctx),$C
++ mov 12($ctx),$D
++ mov $B, at T[0] # magic seed
++ mov 16($ctx),$E
+
-+ &mov ($A,&DWP(0,$E)); # load context
-+ &mov ($B,&DWP(4,$E));
-+ &mov ($C,&DWP(8,$E));
-+ &mov ($D,&DWP(12,$E));
-+ &mov ($E,&DWP(16,$E));
-+ &mov (@T[0],$B); # magic seed
++ movdqa 64($K_XX_XX), at X[2] # pbswap mask
++ movdqa 0($K_XX_XX), at Tx[1] # K_00_19
++ movdqu 0($inp), at X[-4&7] # load input to %xmm[0-3]
++ movdqu 16($inp), at X[-3&7]
++ movdqu 32($inp), at X[-2&7]
++ movdqu 48($inp), at X[-1&7]
++ pshufb @X[2], at X[-4&7] # byte swap
++ add \$64,$inp
++ pshufb @X[2], at X[-3&7]
++ pshufb @X[2], at X[-2&7]
++ pshufb @X[2], at X[-1&7]
++ paddd @Tx[1], at X[-4&7] # add K_00_19
++ paddd @Tx[1], at X[-3&7]
++ paddd @Tx[1], at X[-2&7]
++ movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
++ psubd @Tx[1], at X[-4&7] # restore X[]
++ movdqa @X[-3&7],16(%rsp)
++ psubd @Tx[1], at X[-3&7]
++ movdqa @X[-2&7],32(%rsp)
++ psubd @Tx[1], at X[-2&7]
++ jmp .Loop_ssse3
++___
+
-+ &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
-+ &movdqu (@X[-3&7],&QWP(-48,$inp));
-+ &movdqu (@X[-2&7],&QWP(-32,$inp));
-+ &movdqu (@X[-1&7],&QWP(-16,$inp));
-+ &pshufb (@X[-4&7], at X[2]); # byte swap
-+ &pshufb (@X[-3&7], at X[2]);
-+ &pshufb (@X[-2&7], at X[2]);
-+ &movdqa (&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
-+ &pshufb (@X[-1&7], at X[2]);
-+ &paddd (@X[-4&7], at X[3]); # add K_00_19
-+ &paddd (@X[-3&7], at X[3]);
-+ &paddd (@X[-2&7], at X[3]);
-+ &movdqa (&QWP(0,"esp"), at X[-4&7]); # X[]+K xfer to IALU
-+ &psubd (@X[-4&7], at X[3]); # restore X[]
-+ &movdqa (&QWP(0+16,"esp"), at X[-3&7]);
-+ &psubd (@X[-3&7], at X[3]);
-+ &movdqa (&QWP(0+32,"esp"), at X[-2&7]);
-+ &psubd (@X[-2&7], at X[3]);
-+ &movdqa (@X[0], at X[-3&7]);
-+ &jmp (&label("loop"));
++sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
++{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
++ my $arg = pop;
++ $arg = "\$$arg" if ($arg*1 eq $arg);
++ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
++}
+
-+######################################################################
-+# SSE instruction sequence is first broken to groups of indepentent
-+# instructions, independent in respect to their inputs and shifter
-+# (not all architectures have more than one). Then IALU instructions
-+# are "knitted in" between the SSE groups. Distance is maintained for
-+# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
-+# [which allegedly also implements SSSE3]...
-+#
-+# Temporary registers usage. X[2] is volatile at the entry and at the
-+# end is restored from backtrace ring buffer. X[3] is expected to
-+# contain current K_XX_XX constant and is used to caclulate X[-1]+K
-+# from previous round, it becomes volatile the moment the value is
-+# saved to stack for transfer to IALU. X[4] becomes volatile whenever
-+# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
-+# end it is loaded with next K_XX_XX [which becomes X[3] in next
-+# round]...
-+#
+sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
+ my ($a,$b,$c,$d,$e);
+
++ &movdqa (@X[0], at X[-3&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
++ &movdqa (@Tx[0], at X[-1&7]);
+ &palignr(@X[0], at X[-4&7],8); # compose "X[-14]" in "X[0]"
-+ &movdqa (@X[2], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &paddd (@X[3], at X[-1&7]);
-+ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
++ &paddd (@Tx[1], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &psrldq (@X[2],4); # "X[-3]", 3 dwords
++ &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &pxor (@X[0], at X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &pxor (@X[2], at X[-2&7]); # "X[-3]"^"X[-8]"
++ &pxor (@Tx[0], at X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &pxor (@X[0], at X[2]); # "X[0]"^="X[-3]"^"X[-8]"
++ &pxor (@X[0], at Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
++ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &movdqa (@X[4], at X[0]);
-+ &movdqa (@X[2], at X[0]);
++ &movdqa (@Tx[2], at X[0]);
++ &movdqa (@Tx[0], at X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &pslldq (@X[4],12); # "X[0]"<<96, extract one dword
++ &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
+ &paddd (@X[0], at X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &psrld (@X[2],31);
++ &psrld (@Tx[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &movdqa (@X[3], at X[4]);
++ &movdqa (@Tx[1], at Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &psrld (@X[4],30);
-+ &por (@X[0], at X[2]); # "X[0]"<<<=1
++ &psrld (@Tx[2],30);
++ &por (@X[0], at Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &pslld (@X[3],2);
-+ &pxor (@X[0], at X[4]);
++ &pslld (@Tx[1],2);
++ &pxor (@X[0], at Tx[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
++ &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &pxor (@X[0], at X[3]); # "X[0]"^=("X[0]"<<96)<<<2
-+ &movdqa (@X[1], at X[-2&7]) if ($Xi<7);
-+ eval(shift(@insns));
-+ eval(shift(@insns));
++ &pxor (@X[0], at Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
@@ -5355,35 +5220,34 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
-+ &movdqa (@X[2], at X[-1&7]) if ($Xi==8);
++ &movdqa (@Tx[0], at X[-1&7]) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
+ &pxor (@X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
-+ &palignr(@X[2], at X[-2&7],8); # compose "X[-6]"
++ &palignr(@Tx[0], at X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
+ &pxor (@X[0], at X[-7&7]); # "X[0]"^="X[-28]"
-+ &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]); # save X[] to backtrace buffer
+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ if ($Xi%5) {
-+ &movdqa (@X[4], at X[3]); # "perpetuate" K_XX_XX...
-+ } else { # ... or load next one
-+ &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
-+ }
-+ &paddd (@X[3], at X[-1&7]);
++ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
++ if ($Xi%5) {
++ &movdqa (@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
++ } else { # ... or load next one
++ &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
++ }
++ &paddd (@Tx[1], at X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
-+ &pxor (@X[0], at X[2]); # "X[0]"^="X[-6]"
++ &pxor (@X[0], at Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
-+ &movdqa (@X[2], at X[0]);
-+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
++ &movdqa (@Tx[0], at X[0]);
++ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
@@ -5392,7 +5256,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ &pslld (@X[0],2);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
-+ &psrld (@X[2],30);
++ &psrld (@Tx[0],30);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
@@ -5400,21 +5264,21 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
-+ &por (@X[0], at X[2]); # "X[0]"<<<=2
++ &por (@X[0], at Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
-+ &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
++ &movdqa (@Tx[1], at X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ eval(shift(@insns)); # ror
-+ &movdqa (@X[3], at X[0]) if ($Xi<19);
++ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
@@ -5424,30 +5288,29 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
-+ &paddd (@X[3], at X[-1&7]);
++ &paddd (@Tx[1], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer IALU
++ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
-+ &mov ($inp=@T[1],&DWP(192+4,"esp"));
-+ &cmp ($inp,&DWP(192+8,"esp"));
-+ &je (&label("done"));
++ &cmp ($inp,$num);
++ &je (".Ldone_ssse3");
+
-+ &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19
-+ &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask
-+ &movdqu (@X[-4&7],&QWP(0,$inp)); # load input
-+ &movdqu (@X[-3&7],&QWP(16,$inp));
-+ &movdqu (@X[-2&7],&QWP(32,$inp));
-+ &movdqu (@X[-1&7],&QWP(48,$inp));
-+ &add ($inp,64);
++ unshift(@Tx,pop(@Tx));
++
++ &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
++ &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
++ &movdqu (@X[-4&7],"0($inp)"); # load input
++ &movdqu (@X[-3&7],"16($inp)");
++ &movdqu (@X[-2&7],"32($inp)");
++ &movdqu (@X[-1&7],"48($inp)");
+ &pshufb (@X[-4&7], at X[2]); # byte swap
-+ &mov (&DWP(192+4,"esp"),$inp);
-+ &movdqa (&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
++ &add ($inp,64);
+
+ $Xi=0;
+}
@@ -5463,15 +5326,15 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ &pshufb (@X[($Xi-3)&7], at X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &paddd (@X[($Xi-4)&7], at X[3]);
++ &paddd (@X[($Xi-4)&7], at Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &movdqa (&QWP(0+16*$Xi,"esp"), at X[($Xi-4)&7]); # X[]+K xfer to IALU
++ &movdqa (eval(16*$Xi)."(%rsp)", at X[($Xi-4)&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &psubd (@X[($Xi-4)&7], at X[3]);
++ &psubd (@X[($Xi-4)&7], at Tx[1]);
+
+ foreach (@insns) { eval; }
+ $Xi++;
@@ -5489,7 +5352,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+sub body_00_19 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
-+ '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer
++ '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
+ '&xor ($c,$d);',
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
@@ -5505,7 +5368,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+sub body_20_39 () {
+ (
+ '($a,$b,$c,$d,$e)=@V;'.
-+ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
++ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&xor (@T[0],$d);', # ($b^$d)
+ '&mov (@T[1],$a);', # $b in next round
+ '&$_rol ($a,5);',
@@ -5521,7 +5384,7 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ '($a,$b,$c,$d,$e)=@V;'.
+ '&mov (@T[1],$c);',
+ '&xor ($c,$d);',
-+ '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
++ '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
+ '&and (@T[1],$d);',
+ '&and (@T[0],$c);', # ($b&($c^$d))
+ '&$_ror ($b,7);', # $b>>>2
@@ -5533,8 +5396,10 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ );
+}
-+
-+&set_label("loop",16);
++$code.=<<___;
++.align 16
++.Loop_ssse3:
++___
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
+ &Xupdate_ssse3_16_31(\&body_00_19);
@@ -5559,133 +5424,125 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ &Xloop_ssse3(\&body_20_39);
+ &Xloop_ssse3(\&body_20_39);
+
-+ &mov (@T[1],&DWP(192,"esp")); # update context
-+ &add ($A,&DWP(0, at T[1]));
-+ &add (@T[0],&DWP(4, at T[1])); # $b
-+ &add ($C,&DWP(8, at T[1]));
-+ &mov (&DWP(0, at T[1]),$A);
-+ &add ($D,&DWP(12, at T[1]));
-+ &mov (&DWP(4, at T[1]), at T[0]);
-+ &add ($E,&DWP(16, at T[1]));
-+ &mov (&DWP(8, at T[1]),$C);
-+ &mov ($B, at T[0]);
-+ &mov (&DWP(12, at T[1]),$D);
-+ &mov (&DWP(16, at T[1]),$E);
-+ &movdqa (@X[0], at X[-3&7]);
-+
-+ &jmp (&label("loop"));
++$code.=<<___;
++ add 0($ctx),$A # update context
++ add 4($ctx), at T[0]
++ add 8($ctx),$C
++ add 12($ctx),$D
++ mov $A,0($ctx)
++ add 16($ctx),$E
++ mov @T[0],4($ctx)
++ mov @T[0],$B # magic seed
++ mov $C,8($ctx)
++ mov $D,12($ctx)
++ mov $E,16($ctx)
++ jmp .Loop_ssse3
+
-+&set_label("done",16); $j=$saved_j; @V=@saved_V;
++.align 16
++.Ldone_ssse3:
++___
++ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+ &Xtail_ssse3(\&body_20_39);
+
-+ &mov (@T[1],&DWP(192,"esp")); # update context
-+ &add ($A,&DWP(0, at T[1]));
-+ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
-+ &add (@T[0],&DWP(4, at T[1])); # $b
-+ &add ($C,&DWP(8, at T[1]));
-+ &mov (&DWP(0, at T[1]),$A);
-+ &add ($D,&DWP(12, at T[1]));
-+ &mov (&DWP(4, at T[1]), at T[0]);
-+ &add ($E,&DWP(16, at T[1]));
-+ &mov (&DWP(8, at T[1]),$C);
-+ &mov (&DWP(12, at T[1]),$D);
-+ &mov (&DWP(16, at T[1]),$E);
-+
-+&function_end("_sha1_block_data_order_ssse3");
++$code.=<<___;
++ add 0($ctx),$A # update context
++ add 4($ctx), at T[0]
++ add 8($ctx),$C
++ mov $A,0($ctx)
++ add 12($ctx),$D
++ mov @T[0],4($ctx)
++ add 16($ctx),$E
++ mov $C,8($ctx)
++ mov $D,12($ctx)
++ mov $E,16($ctx)
++___
++$code.=<<___ if ($win64);
++ movaps 64+0(%rsp),%xmm6
++ movaps 64+16(%rsp),%xmm7
++ movaps 64+32(%rsp),%xmm8
++ movaps 64+48(%rsp),%xmm9
++ movaps 64+64(%rsp),%xmm10
++___
++$code.=<<___;
++ lea `64+($win64?6*16:0)`(%rsp),%rsi
++ mov 0(%rsi),%r12
++ mov 8(%rsi),%rbp
++ mov 16(%rsi),%rbx
++ lea 24(%rsi),%rsp
++.Lepilogue_ssse3:
++ ret
++.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
++___
+
-+if ($ymm) {
-+my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded
-+my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4
-+my @V=($A,$B,$C,$D,$E);
-+my $j=0; # hash round
-+my @T=($T,$tmp1);
-+my $inp;
++if ($avx) {
++my $Xi=4;
++my @X=map("%xmm$_",(4..7,0..3));
++my @Tx=map("%xmm$_",(8..10));
++my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
++my @T=("%esi","%edi");
++my $j=0;
++my $K_XX_XX="%r11";
+
+my $_rol=sub { &shld(@_[0], at _) };
+my $_ror=sub { &shrd(@_[0], at _) };
+
-+&function_begin("_sha1_block_data_order_avx");
-+ &call (&label("pic_point")); # make it PIC!
-+ &set_label("pic_point");
-+ &blindpop($tmp1);
-+ &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
-+&set_label("avx_shortcut");
-+ &vzeroall();
-+
-+ &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19
-+ &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39
-+ &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59
-+ &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79
-+ &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask
-+
-+ &mov ($E,&wparam(0)); # load argument block
-+ &mov ($inp=@T[1],&wparam(1));
-+ &mov ($D,&wparam(2));
-+ &mov (@T[0],"esp");
-+
-+ # stack frame layout
-+ #
-+ # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area
-+ # X[4]+K X[5]+K X[6]+K X[7]+K
-+ # X[8]+K X[9]+K X[10]+K X[11]+K
-+ # X[12]+K X[13]+K X[14]+K X[15]+K
-+ #
-+ # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area
-+ # X[4] X[5] X[6] X[7]
-+ # X[8] X[9] X[10] X[11] # even borrowed for K_00_19
-+ #
-+ # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants
-+ # K_40_59 K_40_59 K_40_59 K_40_59
-+ # K_60_79 K_60_79 K_60_79 K_60_79
-+ # K_00_19 K_00_19 K_00_19 K_00_19
-+ # pbswap mask
-+ #
-+ # +192 ctx # argument block
-+ # +196 inp
-+ # +200 end
-+ # +204 esp
-+ &sub ("esp",208);
-+ &and ("esp",-64);
++$code.=<<___;
++.type sha1_block_data_order_avx,\@function,3
++.align 16
++sha1_block_data_order_avx:
++_avx_shortcut:
++ push %rbx
++ push %rbp
++ push %r12
++ lea `-64-($win64?5*16:0)`(%rsp),%rsp
++___
++$code.=<<___ if ($win64);
++ movaps %xmm6,64+0(%rsp)
++ movaps %xmm7,64+16(%rsp)
++ movaps %xmm8,64+32(%rsp)
++ movaps %xmm9,64+48(%rsp)
++ movaps %xmm10,64+64(%rsp)
++.Lprologue_avx:
++___
++$code.=<<___;
++ mov %rdi,$ctx # reassigned argument
++ mov %rsi,$inp # reassigned argument
++ mov %rdx,$num # reassigned argument
++ vzeroall
+
-+ &vmovdqa(&QWP(112+0,"esp"), at X[4]); # copy constants
-+ &vmovdqa(&QWP(112+16,"esp"), at X[5]);
-+ &vmovdqa(&QWP(112+32,"esp"), at X[6]);
-+ &shl ($D,6); # len*64
-+ &vmovdqa(&QWP(112+48,"esp"), at X[3]);
-+ &add ($D,$inp); # end of input
-+ &vmovdqa(&QWP(112+64,"esp"), at X[2]);
-+ &add ($inp,64);
-+ &mov (&DWP(192+0,"esp"),$E); # save argument block
-+ &mov (&DWP(192+4,"esp"),$inp);
-+ &mov (&DWP(192+8,"esp"),$D);
-+ &mov (&DWP(192+12,"esp"), at T[0]); # save original %esp
++ shl \$6,$num
++ add $inp,$num
++ lea K_XX_XX(%rip),$K_XX_XX
+
-+ &mov ($A,&DWP(0,$E)); # load context
-+ &mov ($B,&DWP(4,$E));
-+ &mov ($C,&DWP(8,$E));
-+ &mov ($D,&DWP(12,$E));
-+ &mov ($E,&DWP(16,$E));
-+ &mov (@T[0],$B); # magic seed
++ mov 0($ctx),$A # load context
++ mov 4($ctx),$B
++ mov 8($ctx),$C
++ mov 12($ctx),$D
++ mov $B, at T[0] # magic seed
++ mov 16($ctx),$E
+
-+ &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3]
-+ &vmovdqu(@X[-3&7],&QWP(-48,$inp));
-+ &vmovdqu(@X[-2&7],&QWP(-32,$inp));
-+ &vmovdqu(@X[-1&7],&QWP(-16,$inp));
-+ &vpshufb(@X[-4&7], at X[-4&7], at X[2]); # byte swap
-+ &vpshufb(@X[-3&7], at X[-3&7], at X[2]);
-+ &vpshufb(@X[-2&7], at X[-2&7], at X[2]);
-+ &vmovdqa(&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
-+ &vpshufb(@X[-1&7], at X[-1&7], at X[2]);
-+ &vpaddd (@X[0], at X[-4&7], at X[3]); # add K_00_19
-+ &vpaddd (@X[1], at X[-3&7], at X[3]);
-+ &vpaddd (@X[2], at X[-2&7], at X[3]);
-+ &vmovdqa(&QWP(0,"esp"), at X[0]); # X[]+K xfer to IALU
-+ &vmovdqa(&QWP(0+16,"esp"), at X[1]);
-+ &vmovdqa(&QWP(0+32,"esp"), at X[2]);
-+ &jmp (&label("loop"));
++ vmovdqa 64($K_XX_XX), at X[2] # pbswap mask
++ vmovdqa 0($K_XX_XX), at Tx[1] # K_00_19
++ vmovdqu 0($inp), at X[-4&7] # load input to %xmm[0-3]
++ vmovdqu 16($inp), at X[-3&7]
++ vmovdqu 32($inp), at X[-2&7]
++ vmovdqu 48($inp), at X[-1&7]
++ vpshufb @X[2], at X[-4&7], at X[-4&7] # byte swap
++ add \$64,$inp
++ vpshufb @X[2], at X[-3&7], at X[-3&7]
++ vpshufb @X[2], at X[-2&7], at X[-2&7]
++ vpshufb @X[2], at X[-1&7], at X[-1&7]
++ vpaddd @Tx[1], at X[-4&7], at X[0] # add K_00_19
++ vpaddd @Tx[1], at X[-3&7], at X[1]
++ vpaddd @Tx[1], at X[-2&7], at X[2]
++ vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
++ vmovdqa @X[1],16(%rsp)
++ vmovdqa @X[2],32(%rsp)
++ jmp .Loop_avx
++___
+
+sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
+{ use integer;
@@ -5699,70 +5556,68 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpaddd (@X[3], at X[3], at X[-1&7]);
-+ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]);# save X[] to backtrace buffer
++ &vpaddd (@Tx[1], at Tx[1], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpsrldq(@X[2], at X[-1&7],4); # "X[-3]", 3 dwords
++ &vpsrldq(@Tx[0], at X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"^="X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpxor (@X[2], at X[2], at X[-2&7]); # "X[-3]"^"X[-8]"
++ &vpxor (@Tx[0], at Tx[0], at X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpxor (@X[0], at X[0], at X[2]); # "X[0]"^="X[-3]"^"X[-8]"
++ &vpxor (@X[0], at X[0], at Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
++ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpsrld (@X[2], at X[0],31);
++ &vpsrld (@Tx[0], at X[0],31);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpslldq(@X[4], at X[0],12); # "X[0]"<<96, extract one dword
++ &vpslldq(@Tx[2], at X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0], at X[0], at X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpsrld (@X[3], at X[4],30);
-+ &vpor (@X[0], at X[0], at X[2]); # "X[0]"<<<=1
++ &vpsrld (@Tx[1], at Tx[2],30);
++ &vpor (@X[0], at X[0], at Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpslld (@X[4], at X[4],2);
-+ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer
-+ eval(shift(@insns));
-+ eval(shift(@insns));
-+ &vpxor (@X[0], at X[0], at X[3]);
++ &vpslld (@Tx[2], at Tx[2],2);
++ &vpxor (@X[0], at X[0], at Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vpxor (@X[0], at X[0], at X[4]); # "X[0]"^=("X[0]"<<96)<<<2
++ &vpxor (@X[0], at X[0], at Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX
++ &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+
++
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++ push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
@@ -5771,34 +5626,33 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my ($a,$b,$c,$d,$e);
+
-+ &vpalignr(@X[2], at X[-1&7], at X[-2&7],8); # compose "X[-6]"
-+ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
++ &vpalignr(@Tx[0], at X[-1&7], at X[-2&7],8); # compose "X[-6]"
++ &vpxor (@X[0], at X[0], at X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
-+ &vpxor (@X[0], at X[0], at X[-7&7]); # "X[0]"^="X[-28]"
-+ &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"), at X[-4&7]); # save X[] to backtrace buffer
-+ eval(shift(@insns));
++ &vpxor (@X[0], at X[0], at X[-7&7]); # "X[0]"^="X[-28]"
+ eval(shift(@insns));
-+ if ($Xi%5) {
-+ &vmovdqa (@X[4], at X[3]); # "perpetuate" K_XX_XX...
-+ } else { # ... or load next one
-+ &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp"));
-+ }
-+ &vpaddd (@X[3], at X[3], at X[-1&7]);
++ eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
++ if ($Xi%5) {
++ &vmovdqa (@Tx[2], at Tx[1]);# "perpetuate" K_XX_XX...
++ } else { # ... or load next one
++ &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
++ }
++ &vpaddd (@Tx[1], at Tx[1], at X[-1&7]);
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
-+ &vpxor (@X[0], at X[0], at X[2]); # "X[0]"^="X[-6]"
++ &vpxor (@X[0], at X[0], at Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+
-+ &vpsrld (@X[2], at X[0],30);
-+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer to IALU
++ &vpsrld (@Tx[0], at X[0],30);
++ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns)); # ror
@@ -5814,20 +5668,21 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ eval(shift(@insns)); # ror
+ eval(shift(@insns));
+
-+ &vpor (@X[0], at X[0], at X[2]); # "X[0]"<<<=2
++ &vpor (@X[0], at X[0], at Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns));
-+ &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer
++ &vmovdqa (@Tx[1], at X[0]) if ($Xi<19);
+ eval(shift(@insns));
+ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ eval(shift(@insns)); # ror
++ eval(shift(@insns)); # rol
+ eval(shift(@insns));
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++; push(@X,shift(@X)); # "rotate" X[]
++ push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
@@ -5837,30 +5692,29 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ my ($a,$b,$c,$d,$e);
+
+ eval(shift(@insns));
-+ &vpaddd (@X[3], at X[3], at X[-1&7]);
++ &vpaddd (@Tx[1], at Tx[1], at X[-1&7]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
-+ &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"), at X[3]); # X[]+K xfer IALU
++ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)", at Tx[1]); # X[]+K xfer IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
-+ &mov ($inp=@T[1],&DWP(192+4,"esp"));
-+ &cmp ($inp,&DWP(192+8,"esp"));
-+ &je (&label("done"));
++ &cmp ($inp,$num);
++ &je (".Ldone_avx");
+
-+ &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19
-+ &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask
-+ &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input
-+ &vmovdqu(@X[-3&7],&QWP(16,$inp));
-+ &vmovdqu(@X[-2&7],&QWP(32,$inp));
-+ &vmovdqu(@X[-1&7],&QWP(48,$inp));
++ unshift(@Tx,pop(@Tx));
++
++ &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
++ &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
++ &vmovdqu(@X[-4&7],"0($inp)"); # load input
++ &vmovdqu(@X[-3&7],"16($inp)");
++ &vmovdqu(@X[-2&7],"32($inp)");
++ &vmovdqu(@X[-1&7],"48($inp)");
++ &vpshufb(@X[-4&7], at X[-4&7], at X[2]); # byte swap
+ &add ($inp,64);
-+ &vpshufb(@X[-4&7], at X[-4&7], at X[2]); # byte swap
-+ &mov (&DWP(192+4,"esp"),$inp);
-+ &vmovdqa(&QWP(112-16,"esp"), at X[3]); # borrow last backtrace slot
+
+ $Xi=0;
+}
@@ -5873,15 +5727,15 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpshufb (@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
++ &vpshufb(@X[($Xi-3)&7], at X[($Xi-3)&7], at X[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vpaddd (@X[$Xi&7], at X[($Xi-4)&7], at X[3]);
++ &vpaddd (@X[$Xi&7], at X[($Xi-4)&7], at Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
-+ &vmovdqa (&QWP(0+16*$Xi,"esp"), at X[$Xi&7]); # X[]+K xfer to IALU
++ &vmovdqa(eval(16*$Xi)."(%rsp)", at X[$Xi&7]); # X[]+K xfer to IALU
+ eval(shift(@insns));
+ eval(shift(@insns));
+
@@ -5898,7 +5752,10 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ foreach (@insns) { eval; }
+}
+
-+&set_label("loop",16);
++$code.=<<___;
++.align 16
++.Loop_avx:
++___
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
+ &Xupdate_avx_16_31(\&body_00_19);
@@ -5923,174 +5780,199 @@ diff -up openssl-1.0.0d/crypto/sha/asm/sha1-586.pl.intelopts openssl-1.0.0d/cryp
+ &Xloop_avx(\&body_20_39);
+ &Xloop_avx(\&body_20_39);
+
-+ &mov (@T[1],&DWP(192,"esp")); # update context
-+ &add ($A,&DWP(0, at T[1]));
-+ &add (@T[0],&DWP(4, at T[1])); # $b
-+ &add ($C,&DWP(8, at T[1]));
-+ &mov (&DWP(0, at T[1]),$A);
-+ &add ($D,&DWP(12, at T[1]));
-+ &mov (&DWP(4, at T[1]), at T[0]);
-+ &add ($E,&DWP(16, at T[1]));
-+ &mov (&DWP(8, at T[1]),$C);
-+ &mov ($B, at T[0]);
-+ &mov (&DWP(12, at T[1]),$D);
-+ &mov (&DWP(16, at T[1]),$E);
-+
-+ &jmp (&label("loop"));
++$code.=<<___;
++ add 0($ctx),$A # update context
++ add 4($ctx), at T[0]
++ add 8($ctx),$C
++ add 12($ctx),$D
++ mov $A,0($ctx)
++ add 16($ctx),$E
++ mov @T[0],4($ctx)
++ mov @T[0],$B # magic seed
++ mov $C,8($ctx)
++ mov $D,12($ctx)
++ mov $E,16($ctx)
++ jmp .Loop_avx
+
-+&set_label("done",16); $j=$saved_j; @V=@saved_V;
++.align 16
++.Ldone_avx:
++___
++ $j=$saved_j; @V=@saved_V;
+
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+ &Xtail_avx(\&body_20_39);
+
-+ &vzeroall();
++$code.=<<___;
++ vzeroall
+
-+ &mov (@T[1],&DWP(192,"esp")); # update context
-+ &add ($A,&DWP(0, at T[1]));
-+ &mov ("esp",&DWP(192+12,"esp")); # restore %esp
-+ &add (@T[0],&DWP(4, at T[1])); # $b
-+ &add ($C,&DWP(8, at T[1]));
-+ &mov (&DWP(0, at T[1]),$A);
-+ &add ($D,&DWP(12, at T[1]));
-+ &mov (&DWP(4, at T[1]), at T[0]);
-+ &add ($E,&DWP(16, at T[1]));
-+ &mov (&DWP(8, at T[1]),$C);
-+ &mov (&DWP(12, at T[1]),$D);
-+ &mov (&DWP(16, at T[1]),$E);
-+&function_end("_sha1_block_data_order_avx");
-+}
-+&set_label("K_XX_XX",64);
-+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19
-+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39
-+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59
-+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79
-+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask
++ add 0($ctx),$A # update context
++ add 4($ctx), at T[0]
++ add 8($ctx),$C
++ mov $A,0($ctx)
++ add 12($ctx),$D
++ mov @T[0],4($ctx)
++ add 16($ctx),$E
++ mov $C,8($ctx)
++ mov $D,12($ctx)
++ mov $E,16($ctx)
++___
++$code.=<<___ if ($win64);
++ movaps 64+0(%rsp),%xmm6
++ movaps 64+16(%rsp),%xmm7
++ movaps 64+32(%rsp),%xmm8
++ movaps 64+48(%rsp),%xmm9
++ movaps 64+64(%rsp),%xmm10
++___
++$code.=<<___;
++ lea `64+($win64?6*16:0)`(%rsp),%rsi
++ mov 0(%rsi),%r12
++ mov 8(%rsi),%rbp
++ mov 16(%rsi),%rbx
++ lea 24(%rsi),%rsp
++.Lepilogue_avx:
++ ret
++.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
++___
+}
- &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
++$code.=<<___;
++.align 64
++K_XX_XX:
++.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
++.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
++.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
++.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
++.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
++___
++}}}
++$code.=<<___;
++.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.align 64
+ ___
- &asm_finish();
-diff -up openssl-1.0.0d/crypto/x86cpuid.pl.intelopts openssl-1.0.0d/crypto/x86cpuid.pl
---- openssl-1.0.0d/crypto/x86cpuid.pl.intelopts 2010-02-12 18:02:12.000000000 +0100
-+++ openssl-1.0.0d/crypto/x86cpuid.pl 2011-11-03 09:55:42.000000000 +0100
-@@ -1,4 +1,4 @@
--#!/usr/bin/env perl
-+#!/usr/bin/perl
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+@@ -272,25 +1109,73 @@ se_handler:
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- push(@INC, "${dir}perlasm", "perlasm");
-@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- &pop ("eax");
- &xor ("ecx","eax");
- &bt ("ecx",21);
-- &jnc (&label("done"));
-+ &jnc (&label("generic"));
- &xor ("eax","eax");
- &cpuid ();
- &mov ("edi","eax"); # max value for standard query level
-@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- # AMD specific
- &mov ("eax",0x80000000);
- &cpuid ();
-- &cmp ("eax",0x80000008);
-+ &cmp ("eax",0x80000001);
-+ &jb (&label("intel"));
-+ &mov ("esi","eax");
-+ &mov ("eax",0x80000001);
-+ &cpuid ();
-+ &or ("ebp","ecx");
-+ &and ("ebp",1<<11|1); # isolate XOP bit
-+ &cmp ("esi",0x80000008);
- &jb (&label("intel"));
+ lea .Lprologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+- jb .Lin_prologue
++ jb .Lcommon_seh_tail
- &mov ("eax",0x80000008);
-@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- &mov ("eax",1);
- &cpuid ();
- &bt ("edx",28);
-- &jnc (&label("done"));
-+ &jnc (&label("generic"));
- &shr ("ebx",16);
- &and ("ebx",0xff);
- &cmp ("ebx","esi");
-- &ja (&label("done"));
-+ &ja (&label("generic"));
- &and ("edx",0xefffffff); # clear hyper-threading bit
-- &jmp (&label("done"));
-+ &jmp (&label("generic"));
-
- &set_label("intel");
- &cmp ("edi",4);
-@@ -85,27 +92,45 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- &set_label("nocacheinfo");
- &mov ("eax",1);
- &cpuid ();
-+ &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
- &cmp ("ebp",0);
-- &jne (&label("notP4"));
-+ &jne (&label("notintel"));
-+ &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
- &and (&HB("eax"),15); # familiy ID
- &cmp (&HB("eax"),15); # P4?
-- &jne (&label("notP4"));
-- &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
--&set_label("notP4");
-+ &jne (&label("notintel"));
-+ &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
-+&set_label("notintel");
- &bt ("edx",28); # test hyper-threading bit
-- &jnc (&label("done"));
-+ &jnc (&label("generic"));
- &and ("edx",0xefffffff);
- &cmp ("edi",0);
-- &je (&label("done"));
-+ &je (&label("generic"));
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+- jae .Lin_prologue
++ jae .Lcommon_seh_tail
- &or ("edx",0x10000000);
- &shr ("ebx",16);
- &cmp (&LB("ebx"),1);
-- &ja (&label("done"));
-+ &ja (&label("generic"));
- &and ("edx",0xefffffff); # clear hyper-threading bit if not
+ mov `16*4`(%rax),%rax # pull saved stack pointer
+- lea 24(%rax),%rax
++ lea 32(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
++ mov -32(%rax),%r13
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
++ mov %r13,224($context) # restore context->R13
+
-+&set_label("generic");
-+ &and ("ebp",1<<11); # isolate AMD XOP flag
-+ &and ("ecx",0xfffff7ff); # force 11th bit to 0
-+ &mov ("esi","edx");
-+ &or ("ebp","ecx"); # merge AMD XOP flag
++ jmp .Lcommon_seh_tail
++.size se_handler,.-se_handler
+
-+ &bt ("ecx",27); # check OSXSAVE bit
-+ &jnc (&label("clear_avx"));
-+ &xor ("ecx","ecx"); # XCR0
-+ &data_byte(0x0f,0x01,0xd0); # xgetbv
-+ &and ("eax",6); # isolate XMM and YMM state support
-+ &cmp ("eax",6);
-+ &je (&label("done"));
-+&set_label("clear_avx");
-+ &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
- &set_label("done");
-- &mov ("eax","edx");
-- &mov ("edx","ecx");
-+ &mov ("eax","esi");
-+ &mov ("edx","ebp");
- &function_end("OPENSSL_ia32_cpuid");
++.type ssse3_handler,\@abi-omnipotent
++.align 16
++ssse3_handler:
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp
++
++ mov 120($context),%rax # pull context->Rax
++ mov 248($context),%rbx # pull context->Rip
++
++ mov 8($disp),%rsi # disp->ImageBase
++ mov 56($disp),%r11 # disp->HandlerData
++
++ mov 0(%r11),%r10d # HandlerData[0]
++ lea (%rsi,%r10),%r10 # prologue label
++ cmp %r10,%rbx # context->Rip<prologue label
++ jb .Lcommon_seh_tail
++
++ mov 152($context),%rax # pull context->Rsp
- &external_label("OPENSSL_ia32cap_P");
-@@ -199,8 +224,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
- &bt (&DWP(0,"ecx"),1);
- &jnc (&label("no_x87"));
- if ($sse2) {
-- &bt (&DWP(0,"ecx"),26);
-- &jnc (&label("no_sse2"));
-+ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
-+ &cmp ("ecx",1<<26|1<<24);
-+ &jne (&label("no_sse2"));
- &pxor ("xmm0","xmm0");
- &pxor ("xmm1","xmm1");
- &pxor ("xmm2","xmm2");
-diff -up openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0d/crypto/x86_64cpuid.pl
---- openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts 2010-04-14 21:25:09.000000000 +0200
-+++ openssl-1.0.0d/crypto/x86_64cpuid.pl 2011-08-24 12:50:56.000000000 +0200
+-.Lin_prologue:
++ mov 4(%r11),%r10d # HandlerData[1]
++ lea (%rsi,%r10),%r10 # epilogue label
++ cmp %r10,%rbx # context->Rip>=epilogue label
++ jae .Lcommon_seh_tail
++
++ lea 64(%rax),%rsi
++ lea 512($context),%rdi # &context.Xmm6
++ mov \$10,%ecx
++ .long 0xa548f3fc # cld; rep movsq
++ lea 24+5*16(%rax),%rax # adjust stack pointer
++
++ mov -8(%rax),%rbx
++ mov -16(%rax),%rbp
++ mov %rbx,144($context) # restore context->Rbx
++ mov %rbp,160($context) # restore context->Rbp
++
++.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+@@ -328,19 +1213,38 @@ se_handler:
+ pop %rdi
+ pop %rsi
+ ret
+-.size se_handler,.-se_handler
++.size ssse3_handler,.-ssse3_handler
+
+ .section .pdata
+ .align 4
+ .rva .LSEH_begin_sha1_block_data_order
+ .rva .LSEH_end_sha1_block_data_order
+ .rva .LSEH_info_sha1_block_data_order
+-
++ .rva .LSEH_begin_sha1_block_data_order_ssse3
++ .rva .LSEH_end_sha1_block_data_order_ssse3
++ .rva .LSEH_info_sha1_block_data_order_ssse3
++___
++$code.=<<___ if ($avx);
++ .rva .LSEH_begin_sha1_block_data_order_avx
++ .rva .LSEH_end_sha1_block_data_order_avx
++ .rva .LSEH_info_sha1_block_data_order_avx
++___
++$code.=<<___;
+ .section .xdata
+ .align 8
+ .LSEH_info_sha1_block_data_order:
+ .byte 9,0,0,0
+ .rva se_handler
++.LSEH_info_sha1_block_data_order_ssse3:
++ .byte 9,0,0,0
++ .rva ssse3_handler
++ .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
++___
++$code.=<<___ if ($avx);
++.LSEH_info_sha1_block_data_order_avx:
++ .byte 9,0,0,0
++ .rva ssse3_handler
++ .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
+ ___
+ }
+
+diff -up openssl-1.0.0k/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0k/crypto/x86_64cpuid.pl
+--- openssl-1.0.0k/crypto/x86_64cpuid.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/x86_64cpuid.pl 2013-02-19 21:21:59.833360113 +0100
@@ -1,4 +1,4 @@
-#!/usr/bin/env perl
+#!/usr/bin/perl
@@ -6101,12 +5983,12 @@ diff -up openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0d/crypto/x8
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
--open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+-open STDOUT,"| \"$^X\" ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
-+open STDOUT,"| $^X $xlate $flavour $output";
++open STDOUT,"| \"$^X\" $xlate $flavour $output";
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
@@ -6219,3 +6101,121 @@ diff -up openssl-1.0.0d/crypto/x86_64cpuid.pl.intelopts openssl-1.0.0d/crypto/x8
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+diff -up openssl-1.0.0k/crypto/x86cpuid.pl.intelopts openssl-1.0.0k/crypto/x86cpuid.pl
+--- openssl-1.0.0k/crypto/x86cpuid.pl.intelopts 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/x86cpuid.pl 2013-02-19 21:15:39.634408163 +0100
+@@ -1,4 +1,4 @@
+-#!/usr/bin/env perl
++#!/usr/bin/perl
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC, "${dir}perlasm", "perlasm");
+@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ &pop ("eax");
+ &xor ("ecx","eax");
+ &bt ("ecx",21);
+- &jnc (&label("done"));
++ &jnc (&label("generic"));
+ &xor ("eax","eax");
+ &cpuid ();
+ &mov ("edi","eax"); # max value for standard query level
+@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ # AMD specific
+ &mov ("eax",0x80000000);
+ &cpuid ();
+- &cmp ("eax",0x80000008);
++ &cmp ("eax",0x80000001);
++ &jb (&label("intel"));
++ &mov ("esi","eax");
++ &mov ("eax",0x80000001);
++ &cpuid ();
++ &or ("ebp","ecx");
++ &and ("ebp",1<<11|1); # isolate XOP bit
++ &cmp ("esi",0x80000008);
+ &jb (&label("intel"));
+
+ &mov ("eax",0x80000008);
+@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ &mov ("eax",1);
+ &cpuid ();
+ &bt ("edx",28);
+- &jnc (&label("done"));
++ &jnc (&label("generic"));
+ &shr ("ebx",16);
+ &and ("ebx",0xff);
+ &cmp ("ebx","esi");
+- &ja (&label("done"));
++ &ja (&label("generic"));
+ &and ("edx",0xefffffff); # clear hyper-threading bit
+- &jmp (&label("done"));
++ &jmp (&label("generic"));
+
+ &set_label("intel");
+ &cmp ("edi",4);
+@@ -85,27 +92,45 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ &set_label("nocacheinfo");
+ &mov ("eax",1);
+ &cpuid ();
++ &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
+ &cmp ("ebp",0);
+- &jne (&label("notP4"));
++ &jne (&label("notintel"));
++ &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
+ &and (&HB("eax"),15); # familiy ID
+ &cmp (&HB("eax"),15); # P4?
+- &jne (&label("notP4"));
+- &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
+-&set_label("notP4");
++ &jne (&label("notintel"));
++ &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
++&set_label("notintel");
+ &bt ("edx",28); # test hyper-threading bit
+- &jnc (&label("done"));
++ &jnc (&label("generic"));
+ &and ("edx",0xefffffff);
+ &cmp ("edi",0);
+- &je (&label("done"));
++ &je (&label("generic"));
+
+ &or ("edx",0x10000000);
+ &shr ("ebx",16);
+ &cmp (&LB("ebx"),1);
+- &ja (&label("done"));
++ &ja (&label("generic"));
+ &and ("edx",0xefffffff); # clear hyper-threading bit if not
++
++&set_label("generic");
++ &and ("ebp",1<<11); # isolate AMD XOP flag
++ &and ("ecx",0xfffff7ff); # force 11th bit to 0
++ &mov ("esi","edx");
++ &or ("ebp","ecx"); # merge AMD XOP flag
++
++ &bt ("ecx",27); # check OSXSAVE bit
++ &jnc (&label("clear_avx"));
++ &xor ("ecx","ecx"); # XCR0
++ &data_byte(0x0f,0x01,0xd0); # xgetbv
++ &and ("eax",6); # isolate XMM and YMM state support
++ &cmp ("eax",6);
++ &je (&label("done"));
++&set_label("clear_avx");
++ &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
+ &set_label("done");
+- &mov ("eax","edx");
+- &mov ("edx","ecx");
++ &mov ("eax","esi");
++ &mov ("edx","ebp");
+ &function_end("OPENSSL_ia32_cpuid");
+
+ &external_label("OPENSSL_ia32cap_P");
+@@ -199,8 +224,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA3
+ &bt (&DWP(0,"ecx"),1);
+ &jnc (&label("no_x87"));
+ if ($sse2) {
+- &bt (&DWP(0,"ecx"),26);
+- &jnc (&label("no_sse2"));
++ &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
++ &cmp ("ecx",1<<26|1<<24);
++ &jne (&label("no_sse2"));
+ &pxor ("xmm0","xmm0");
+ &pxor ("xmm1","xmm1");
+ &pxor ("xmm2","xmm2");
diff --git a/openssl-1.0.0k-secure-getenv.patch b/openssl-1.0.0k-secure-getenv.patch
new file mode 100644
index 0000000..3014b46
--- /dev/null
+++ b/openssl-1.0.0k-secure-getenv.patch
@@ -0,0 +1,154 @@
+diff -up openssl-1.0.0k/crypto/conf/conf_api.c.secure-getenv openssl-1.0.0k/crypto/conf/conf_api.c
+--- openssl-1.0.0k/crypto/conf/conf_api.c.secure-getenv 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/conf/conf_api.c 2013-02-19 21:25:56.623198152 +0100
+@@ -142,7 +142,7 @@ char *_CONF_get_string(const CONF *conf,
+ if (v != NULL) return(v->value);
+ if (strcmp(section,"ENV") == 0)
+ {
+- p=getenv(name);
++ p=__secure_getenv(name);
+ if (p != NULL) return(p);
+ }
+ }
+@@ -155,7 +155,7 @@ char *_CONF_get_string(const CONF *conf,
+ return(NULL);
+ }
+ else
+- return(getenv(name));
++ return (__secure_getenv(name));
+ }
+
+ #if 0 /* There's no way to provide error checking with this function, so
+diff -up openssl-1.0.0k/crypto/conf/conf_mod.c.secure-getenv openssl-1.0.0k/crypto/conf/conf_mod.c
+--- openssl-1.0.0k/crypto/conf/conf_mod.c.secure-getenv 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/conf/conf_mod.c 2013-02-19 21:25:56.624198172 +0100
+@@ -548,8 +548,8 @@ char *CONF_get1_default_config_file(void
+ char *file;
+ int len;
+
+- file = getenv("OPENSSL_CONF");
+- if (file)
++ file = __secure_getenv("OPENSSL_CONF");
++ if (file)
+ return BUF_strdup(file);
+
+ len = strlen(X509_get_default_cert_area());
+diff -up openssl-1.0.0k/crypto/engine/eng_list.c.secure-getenv openssl-1.0.0k/crypto/engine/eng_list.c
+--- openssl-1.0.0k/crypto/engine/eng_list.c.secure-getenv 2013-02-05 12:47:28.000000000 +0100
++++ openssl-1.0.0k/crypto/engine/eng_list.c 2013-02-19 21:25:56.625198193 +0100
+@@ -399,9 +399,9 @@ ENGINE *ENGINE_by_id(const char *id)
+ if (strcmp(id, "dynamic"))
+ {
+ #ifdef OPENSSL_SYS_VMS
+- if((load_dir = getenv("OPENSSL_ENGINES")) == 0) load_dir = "SSLROOT:[ENGINES]";
++ if(OPENSSL_issetugid() || (load_dir = getenv("OPENSSL_ENGINES")) == 0) load_dir = "SSLROOT:[ENGINES]";
+ #else
+- if((load_dir = getenv("OPENSSL_ENGINES")) == 0) load_dir = ENGINESDIR;
++ if((load_dir = __secure_getenv("OPENSSL_ENGINES")) == 0) load_dir = ENGINESDIR;
+ #endif
+ iterator = ENGINE_by_id("dynamic");
+ if(!iterator || !ENGINE_ctrl_cmd_string(iterator, "ID", id, 0) ||
+diff -up openssl-1.0.0k/crypto/md5/md5_dgst.c.secure-getenv openssl-1.0.0k/crypto/md5/md5_dgst.c
+--- openssl-1.0.0k/crypto/md5/md5_dgst.c.secure-getenv 2013-02-19 21:25:56.000000000 +0100
++++ openssl-1.0.0k/crypto/md5/md5_dgst.c 2013-02-19 21:27:02.814550574 +0100
+@@ -78,7 +78,7 @@ const char MD5_version[]="MD5" OPENSSL_V
+ int MD5_Init(MD5_CTX *c)
+ #ifdef OPENSSL_FIPS
+ {
+- if (FIPS_mode() && getenv("OPENSSL_FIPS_NON_APPROVED_MD5_ALLOW") == NULL)
++ if (FIPS_mode() && __secure_getenv("OPENSSL_FIPS_NON_APPROVED_MD5_ALLOW") == NULL)
+ FIPS_BAD_ALGORITHM(alg)
+ return private_MD5_Init(c);
+ }
+diff -up openssl-1.0.0k/crypto/o_init.c.secure-getenv openssl-1.0.0k/crypto/o_init.c
+--- openssl-1.0.0k/crypto/o_init.c.secure-getenv 2013-02-19 21:25:56.491195456 +0100
++++ openssl-1.0.0k/crypto/o_init.c 2013-02-19 21:25:56.628198256 +0100
+@@ -75,7 +75,7 @@ static void init_fips_mode(void)
+ char buf[2] = "0";
+ int fd;
+
+- if (getenv("OPENSSL_FORCE_FIPS_MODE") != NULL)
++ if (__secure_getenv("OPENSSL_FORCE_FIPS_MODE") != NULL)
+ {
+ buf[0] = '1';
+ }
+diff -up openssl-1.0.0k/crypto/rand/randfile.c.secure-getenv openssl-1.0.0k/crypto/rand/randfile.c
+--- openssl-1.0.0k/crypto/rand/randfile.c.secure-getenv 2013-02-05 12:58:46.000000000 +0100
++++ openssl-1.0.0k/crypto/rand/randfile.c 2013-02-19 21:25:56.630198296 +0100
+@@ -275,8 +275,7 @@ const char *RAND_file_name(char *buf, si
+ struct stat sb;
+ #endif
+
+- if (OPENSSL_issetugid() == 0)
+- s=getenv("RANDFILE");
++ s=__secure_getenv("RANDFILE");
+ if (s != NULL && *s && strlen(s) + 1 < size)
+ {
+ if (BUF_strlcpy(buf,s,size) >= size)
+@@ -284,8 +283,7 @@ const char *RAND_file_name(char *buf, si
+ }
+ else
+ {
+- if (OPENSSL_issetugid() == 0)
+- s=getenv("HOME");
++ s=__secure_getenv("HOME");
+ #ifdef DEFAULT_HOME
+ if (s == NULL)
+ {
+diff -up openssl-1.0.0k/crypto/x509/by_dir.c.secure-getenv openssl-1.0.0k/crypto/x509/by_dir.c
+--- openssl-1.0.0k/crypto/x509/by_dir.c.secure-getenv 2013-02-05 12:47:29.000000000 +0100
++++ openssl-1.0.0k/crypto/x509/by_dir.c 2013-02-19 21:25:56.638198460 +0100
+@@ -135,7 +135,7 @@ static int dir_ctrl(X509_LOOKUP *ctx, in
+ case X509_L_ADD_DIR:
+ if (argl == X509_FILETYPE_DEFAULT)
+ {
+- dir=(char *)getenv(X509_get_default_cert_dir_env());
++ dir=(char *)__secure_getenv(X509_get_default_cert_dir_env());
+ if (dir)
+ ret=add_cert_dir(ld,dir,X509_FILETYPE_PEM);
+ else
+diff -up openssl-1.0.0k/crypto/x509/by_file.c.secure-getenv openssl-1.0.0k/crypto/x509/by_file.c
+--- openssl-1.0.0k/crypto/x509/by_file.c.secure-getenv 2013-02-19 21:25:56.431194229 +0100
++++ openssl-1.0.0k/crypto/x509/by_file.c 2013-02-19 21:25:56.639198480 +0100
+@@ -100,7 +100,7 @@ static int by_file_ctrl(X509_LOOKUP *ctx
+ case X509_L_FILE_LOAD:
+ if (argl == X509_FILETYPE_DEFAULT)
+ {
+- file = (char *)getenv(X509_get_default_cert_file_env());
++ file = (char *)__secure_getenv(X509_get_default_cert_file_env());
+ if (file)
+ ok = (X509_load_cert_crl_file(ctx,file,
+ X509_FILETYPE_PEM) != 0);
+diff -up openssl-1.0.0k/crypto/x509/x509_vfy.c.secure-getenv openssl-1.0.0k/crypto/x509/x509_vfy.c
+--- openssl-1.0.0k/crypto/x509/x509_vfy.c.secure-getenv 2013-02-05 12:47:29.000000000 +0100
++++ openssl-1.0.0k/crypto/x509/x509_vfy.c 2013-02-19 21:25:56.642198540 +0100
+@@ -481,7 +481,7 @@ static int check_chain_extensions(X509_S
+ !!(ctx->param->flags & X509_V_FLAG_ALLOW_PROXY_CERTS);
+ /* A hack to keep people who don't want to modify their
+ software happy */
+- if (getenv("OPENSSL_ALLOW_PROXY_CERTS"))
++ if (__secure_getenv("OPENSSL_ALLOW_PROXY_CERTS"))
+ allow_proxy_certs = 1;
+ purpose = ctx->param->purpose;
+ }
+diff -up openssl-1.0.0k/engines/ccgost/gost_ctl.c.secure-getenv openssl-1.0.0k/engines/ccgost/gost_ctl.c
+--- openssl-1.0.0k/engines/ccgost/gost_ctl.c.secure-getenv 2013-02-05 12:47:29.000000000 +0100
++++ openssl-1.0.0k/engines/ccgost/gost_ctl.c 2013-02-19 21:25:56.643198560 +0100
+@@ -65,7 +65,7 @@ const char *get_gost_engine_param(int pa
+ {
+ return gost_params[param];
+ }
+- tmp = getenv(gost_envnames[param]);
++ tmp = __secure_getenv(gost_envnames[param]);
+ if (tmp)
+ {
+ if (gost_params[param]) OPENSSL_free(gost_params[param]);
+@@ -79,7 +79,7 @@ int gost_set_default_param(int param, co
+ {
+ const char *tmp;
+ if (param <0 || param >GOST_PARAM_MAX) return 0;
+- tmp = getenv(gost_envnames[param]);
++ tmp = __secure_getenv(gost_envnames[param]);
+ /* if there is value in the environment, use it, else -passed string * */
+ if (!tmp) tmp=value;
+ if (gost_params[param]) OPENSSL_free(gost_params[param]);
diff --git a/openssl-1.0.0k-version.patch b/openssl-1.0.0k-version.patch
new file mode 100644
index 0000000..f08fab4
--- /dev/null
+++ b/openssl-1.0.0k-version.patch
@@ -0,0 +1,21 @@
+diff -up openssl-1.0.0k/crypto/opensslv.h.version openssl-1.0.0k/crypto/opensslv.h
+--- openssl-1.0.0k/crypto/opensslv.h.version 2013-02-19 21:12:26.903472656 +0100
++++ openssl-1.0.0k/crypto/opensslv.h 2013-02-19 21:14:35.613100870 +0100
+@@ -25,7 +25,7 @@
+ * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
+ * major minor fix final patch/beta)
+ */
+-#define OPENSSL_VERSION_NUMBER 0x100000bfL
++#define OPENSSL_VERSION_NUMBER 0x10000003L
+ #ifdef OPENSSL_FIPS
+ #define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0k-fips 5 Feb 2013"
+ #else
+@@ -83,7 +83,7 @@
+ * should only keep the versions that are binary compatible with the current.
+ */
+ #define SHLIB_VERSION_HISTORY ""
+-#define SHLIB_VERSION_NUMBER "1.0.0"
++#define SHLIB_VERSION_NUMBER "1.0.0k"
+
+
+ #endif /* HEADER_OPENSSLV_H */
diff --git a/openssl-1.0.1e-env-zlib.patch b/openssl-1.0.1e-env-zlib.patch
new file mode 100644
index 0000000..297d3a3
--- /dev/null
+++ b/openssl-1.0.1e-env-zlib.patch
@@ -0,0 +1,29 @@
+diff -up openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod.env-zlib openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod
+--- openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod.env-zlib 2013-02-11 16:02:48.000000000 +0100
++++ openssl-1.0.1e/doc/ssl/SSL_COMP_add_compression_method.pod 2013-02-19 16:32:51.000000000 +0100
+@@ -47,6 +47,13 @@ Once the identities of the compression m
+ been standardized, the compression API will most likely be changed. Using
+ it in the current state is not recommended.
+
+It is also not recommended to use compression if the data transferred contains
+untrusted parts that can be manipulated by an attacker, as the attacker could
+then obtain information about the encrypted data (see the CRIME attack). For
+that reason the default loading of the zlib compression method is
+disabled, and it is enabled only if the environment variable
+B<OPENSSL_DEFAULT_ZLIB> is present during library initialization.
++
+ =head1 RETURN VALUES
+
+ SSL_COMP_add_compression_method() may return the following values:
+diff -up openssl-1.0.1e/ssl/ssl_ciph.c.env-zlib openssl-1.0.1e/ssl/ssl_ciph.c
+--- openssl-1.0.1e/ssl/ssl_ciph.c.env-zlib 2013-02-11 16:26:04.000000000 +0100
++++ openssl-1.0.1e/ssl/ssl_ciph.c 2013-02-19 16:37:36.163545085 +0100
+@@ -455,7 +455,7 @@ static void load_builtin_compressions(vo
+
+ MemCheck_off();
+ ssl_comp_methods=sk_SSL_COMP_new(sk_comp_cmp);
+- if (ssl_comp_methods != NULL)
++ if (ssl_comp_methods != NULL && __secure_getenv("OPENSSL_DEFAULT_ZLIB") != NULL)
+ {
+ comp=(SSL_COMP *)OPENSSL_malloc(sizeof(SSL_COMP));
+ if (comp != NULL)
diff --git a/openssl.spec b/openssl.spec
index c03cc34..14d35a9 100644
--- a/openssl.spec
+++ b/openssl.spec
@@ -20,8 +20,8 @@
Summary: A general purpose cryptography library with TLS implementation
Name: openssl
-Version: 1.0.0j
-Release: 2%{?dist}
+Version: 1.0.0k
+Release: 1%{?dist}
Epoch: 1
# We have to remove certain patented algorithms from the openssl source
# tarball with the hobble-openssl script which is included below.
@@ -52,15 +52,15 @@ Patch34: openssl-0.9.6-x509.patch
Patch35: openssl-0.9.8j-version-add-engines.patch
Patch38: openssl-1.0.0-beta5-cipher-change.patch
Patch39: openssl-1.0.0b-ipv6-apps.patch
-Patch40: openssl-1.0.0f-fips.patch
+Patch40: openssl-1.0.0k-fips.patch
Patch41: openssl-1.0.0-beta3-fipscheck.patch
Patch43: openssl-1.0.0a-fipsmode.patch
Patch44: openssl-1.0.0-beta3-fipsrng.patch
-Patch45: openssl-0.9.8j-env-nozlib.patch
+Patch45: openssl-1.0.1e-env-zlib.patch
Patch47: openssl-1.0.0-beta5-readme-warning.patch
Patch49: openssl-1.0.1a-algo-doc.patch
Patch50: openssl-1.0.0-beta4-dtls1-abi.patch
-Patch51: openssl-1.0.0j-version.patch
+Patch51: openssl-1.0.0k-version.patch
Patch52: openssl-1.0.0b-aesni.patch
Patch53: openssl-1.0.0-name-hash.patch
Patch54: openssl-1.0.0c-speed-fips.patch
@@ -73,11 +73,13 @@ Patch60: openssl-1.0.0d-apps-dgst.patch
Patch61: openssl-1.0.0d-cavs.patch
Patch62: openssl-1.0.0-fips-aesni.patch
Patch63: openssl-1.0.0d-xmpp-starttls.patch
-Patch64: openssl-1.0.0d-intelopts.patch
+Patch64: openssl-1.0.0k-intelopts.patch
Patch65: openssl-1.0.0e-chil-fixes.patch
Patch66: openssl-1.0.0-sha2test.patch
+Patch67: openssl-1.0.0k-secure-getenv.patch
# Backported fixes including security fixes
Patch81: openssl-1.0.0d-padlock64.patch
+Patch82: openssl-1.0.0k-backports.patch
License: OpenSSL
Group: System Environment/Libraries
@@ -153,7 +155,7 @@ from other formats to the formats used by the OpenSSL toolkit.
%patch41 -p1 -b .fipscheck
%patch43 -p1 -b .fipsmode
%patch44 -p1 -b .fipsrng
-%patch45 -p1 -b .env-nozlib
+%patch45 -p1 -b .env-zlib
%patch47 -p1 -b .warning
%patch49 -p1 -b .algo-doc
%patch50 -p1 -b .dtls1-abi
@@ -173,8 +175,10 @@ from other formats to the formats used by the OpenSSL toolkit.
%patch64 -p1 -b .intelopts
%patch65 -p1 -b .chil
%patch66 -p1 -b .sha2test
+%patch67 -p1 -b .secure-getenv
%patch81 -p1 -b .padlock64
+%patch82 -p1 -b .backports
# Modify the various perl scripts to reference perl in the right location.
perl util/perlpath.pl `dirname %{__perl}`
@@ -424,6 +428,9 @@ rm -rf $RPM_BUILD_ROOT/%{_libdir}/fipscanister.*
%postun -p /sbin/ldconfig
%changelog
+* Tue Feb 19 2013 Tomas Mraz <tmraz at redhat.com> 1.0.0k-1
+- new upstream release fixing multiple CVEs
+
* Thu Jul 12 2012 Tomas Mraz <tmraz at redhat.com> 1.0.0j-2
- fix s_server with new glibc when no global IPv6 address (#839031)
diff --git a/sources b/sources
index 4a4e3bd..79c0f0b 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-f6eff5c8ba4db07d702163ba2f37757c openssl-1.0.0j-usa.tar.xz
+a8109e845ff32b19fd928f7dfbcebf66 openssl-1.0.0k-usa.tar.xz
More information about the scm-commits
mailing list