--- /dev/null
+From 9b840c2ddfdae96f078888706cb2574333699e06 Mon Sep 17 00:00:00 2001
+From: Yousong Zhou <yszhou4tech@gmail.com>
+Date: Tue, 11 Aug 2020 16:16:39 +0800
+Subject: [PATCH] Revert "Remove x86/x86_64 BSAES and AES_ASM support"
+
+This reverts commit 87bea6550ae0dda7c40937cff2e86cc2b0b09491.
+---
+ Configurations/00-base-templates.conf | 4 +-
+ crypto/aes/asm/aes-586.pl | 3000 +++++++++++++++++++++++
+ crypto/aes/asm/aes-x86_64.pl | 2916 ++++++++++++++++++++++
+ crypto/aes/asm/bsaes-x86_64.pl | 3239 +++++++++++++++++++++++++
+ 4 files changed, 9157 insertions(+), 2 deletions(-)
+ create mode 100755 crypto/aes/asm/aes-586.pl
+ create mode 100755 crypto/aes/asm/aes-x86_64.pl
+ create mode 100644 crypto/aes/asm/bsaes-x86_64.pl
+
+diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf
+index e01dc63a8b..5fd995cb33 100644
+--- a/Configurations/00-base-templates.conf
++++ b/Configurations/00-base-templates.conf
+@@ -198,7 +198,7 @@ my %targets=(
+ bn_asm_src => "bn-586.s co-586.s x86-mont.s x86-gf2m.s",
+ ec_asm_src => "ecp_nistz256.c ecp_nistz256-x86.s",
+ des_asm_src => "des-586.s crypt586.s",
+- aes_asm_src => "aes_core.c aes_cbc.c vpaes-x86.s aesni-x86.s",
++ aes_asm_src => "aes-586.s vpaes-x86.s aesni-x86.s",
+ bf_asm_src => "bf-586.s",
+ md5_asm_src => "md5-586.s",
+ cast_asm_src => "cast-586.s",
+@@ -223,7 +223,7 @@ my %targets=(
+ cpuid_asm_src => "x86_64cpuid.s",
+ bn_asm_src => "asm/x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s rsaz-avx2.s",
+ ec_asm_src => "ecp_nistz256.c ecp_nistz256-x86_64.s x25519-x86_64.s",
+- aes_asm_src => "aes_core.c aes_cbc.c vpaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
++ aes_asm_src => "aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
+ md5_asm_src => "md5-x86_64.s",
+ sha1_asm_src => "sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s sha1-mb-x86_64.s sha256-mb-x86_64.s",
+ rc4_asm_src => "rc4-x86_64.s rc4-md5-x86_64.s",
+diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
+new file mode 100755
+index 0000000000..29059edf8b
+--- /dev/null
++++ b/crypto/aes/asm/aes-586.pl
+@@ -0,0 +1,3000 @@
++#! /usr/bin/env perl
++# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the OpenSSL license (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# Version 4.3.
++#
++# You might fail to appreciate this module performance from the first
++# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
++# to be *the* best Intel C compiler without -KPIC, performance appears
++# to be virtually identical... But try to re-configure with shared
++# library support... Aha! Intel compiler "suddenly" lags behind by 30%
++# [on P4, more on others]:-) And if compared to position-independent
++# code generated by GNU C, this code performs *more* than *twice* as
++# fast! Yes, all this buzz about PIC means that unlike other hand-
++# coded implementations, this one was explicitly designed to be safe
++# to use even in shared library context... This also means that this
++# code isn't necessarily absolutely fastest "ever," because in order
++# to achieve position independence an extra register has to be
++# off-loaded to stack, which affects the benchmark result.
++#
++# Special note about instruction choice. Do you recall RC4_INT code
++# performing poorly on P4? It might be the time to figure out why.
++# RC4_INT code implies effective address calculations in base+offset*4
++# form. Trouble is that it seems that offset scaling turned out to be
++# the critical path... At least eliminating scaling resulted in 2.8x RC4
++# performance improvement [as you might recall]. As AES code is hungry
++# for scaling too, I [try to] avoid the latter by favoring off-by-2
++# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
++#
++# As was shown by Dean Gaudet, the above note turned out to be
++# void. Performance improvement with off-by-2 shifts was observed on
++# intermediate implementation, which was spilling yet another register
++# to stack... Final offset*4 code below runs just a tad faster on P4,
++# but exhibits up to 10% improvement on other cores.
++#
++# Second version is "monolithic" replacement for aes_core.c, which in
++# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
++# This made it possible to implement little-endian variant of the
++# algorithm without modifying the base C code. Motivating factor for
++# the undertaken effort was that it appeared that in tight IA-32
++# register window little-endian flavor could achieve slightly higher
++# Instruction Level Parallelism, and it indeed resulted in up to 15%
++# better performance on most recent µ-archs...
++#
++# Third version adds AES_cbc_encrypt implementation, which resulted in
++# up to 40% performance improvement of CBC benchmark results. 40% was
++# observed on P4 core, where "overall" improvement coefficient, i.e. if
++# compared to PIC generated by GCC and in CBC mode, was observed to be
++# as large as 4x:-) CBC performance is virtually identical to ECB now
++# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
++# Opteron, because certain function prologues and epilogues are
++# effectively taken out of the loop...
++#
++# Version 3.2 implements compressed tables and prefetch of these tables
++# in CBC[!] mode. Former means that 3/4 of table references are now
++# misaligned, which unfortunately has negative impact on elder IA-32
++# implementations, Pentium suffered 30% penalty, PIII - 10%.
++#
++# Version 3.3 avoids L1 cache aliasing between stack frame and
++# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
++# latter is achieved by copying the key schedule to controlled place in
++# stack. This unfortunately has rather strong impact on small block CBC
++# performance, ~2x deterioration on 16-byte block if compared to 3.3.
++#
++# Version 3.5 checks if there is L1 cache aliasing between user-supplied
++# key schedule and S-boxes and abstains from copying the former if
++# there is no. This allows end-user to consciously retain small block
++# performance by aligning key schedule in specific manner.
++#
++# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
++#
++# Current ECB performance numbers for 128-bit key in CPU cycles per
++# processed byte [measure commonly used by AES benchmarkers] are:
++#
++# small footprint fully unrolled
++# P4 24 22
++# AMD K8 20 19
++# PIII 25 23
++# Pentium 81 78
++#
++# Version 3.7 reimplements outer rounds as "compact." Meaning that
++# first and last rounds reference compact 256 bytes S-box. This means
++# that first round consumes a lot more CPU cycles and that encrypt
++# and decrypt performance becomes asymmetric. Encrypt performance
++# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
++# aggressively pre-fetched.
++#
++# Version 4.0 effectively rolls back to 3.6 and instead implements
++# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
++# which use exclusively 256 byte S-box. These functions are to be
++# called in modes not concealing plain text, such as ECB, or when
++# we're asked to process smaller amount of data [or unconditionally
++# on hyper-threading CPU]. Currently it's called unconditionally from
++# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
++# still needs to be modified to switch between slower and faster
++# mode when appropriate... But in either case benchmark landscape
++# changes dramatically and below numbers are CPU cycles per processed
++# byte for 128-bit key.
++#
++# ECB encrypt ECB decrypt CBC large chunk
++# P4 52[54] 83[95] 23
++# AMD K8 46[41] 66[70] 18
++# PIII 41[50] 60[77] 24
++# Core 2 31[36] 45[64] 18.5
++# Atom 76[100] 96[138] 60
++# Pentium 115 150 77
++#
++# Version 4.1 switches to compact S-box even in key schedule setup.
++#
++# Version 4.2 prefetches compact S-box in every SSE round or in other
++# words every cache-line is *guaranteed* to be accessed within ~50
++# cycles window. Why just SSE? Because it's needed on hyper-threading
++# CPU! Which is also why it's prefetched with 64 byte stride. Best
++# part is that it has no negative effect on performance:-)
++#
++# Version 4.3 implements switch between compact and non-compact block
++# functions in AES_cbc_encrypt depending on how much data was asked
++# to be processed in one stroke.
++#
++######################################################################
++# Timing attacks are classified in two classes: synchronous when
++# attacker consciously initiates cryptographic operation and collects
++# timing data of various character afterwards, and asynchronous when
++# malicious code is executed on same CPU simultaneously with AES,
++# instruments itself and performs statistical analysis of this data.
++#
++# As far as synchronous attacks go the root to the AES timing
++# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
++# are referred to in single 128-bit block operation. Well, in C
++# implementation with 4 distinct tables it's actually as little as 40
++# references per 256 elements table, but anyway... Secondly, even
++# though S-box elements are clustered into smaller amount of cache-
++# lines, smaller than 160 and even 40, it turned out that for certain
++# plain-text pattern[s] or simply put chosen plain-text and given key
++# few cache-lines remain unaccessed during block operation. Now, if
++# attacker can figure out this access pattern, he can deduce the key
++# [or at least part of it]. The natural way to mitigate this kind of
++# attacks is to minimize the amount of cache-lines in S-box and/or
++# prefetch them to ensure that every one is accessed for more uniform
++# timing. But note that *if* plain-text was concealed in such way that
++# input to block function is distributed *uniformly*, then attack
++# wouldn't apply. Now note that some encryption modes, most notably
++# CBC, do mask the plain-text in this exact way [secure cipher output
++# is distributed uniformly]. Yes, one still might find input that
++# would reveal the information about given key, but if amount of
++# candidate inputs to be tried is larger than amount of possible key
++# combinations then attack becomes infeasible. This is why revised
++# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
++# of data is to be processed in one stroke. The current size limit of
++# 512 bytes is chosen to provide same [diminishingly low] probability
++# for cache-line to remain untouched in large chunk operation with
++# large S-box as for single block operation with compact S-box and
++# surely needs more careful consideration...
++#
++# As for asynchronous attacks. There are two flavours: attacker code
++# being interleaved with AES on hyper-threading CPU at *instruction*
++# level, and two processes time sharing single core. As for latter.
++# Two vectors. 1. Given that attacker process has higher priority,
++# yield execution to process performing AES just before timer fires
++# off the scheduler, immediately regain control of CPU and analyze the
++# cache state. For this attack to be efficient attacker would have to
++# effectively slow down the operation by several *orders* of magnitude,
++# by ratio of time slice to duration of handful of AES rounds, which
++# is unlikely to remain unnoticed. Not to mention that this also means
++# that he would spend correspondingly more time to collect enough
++# statistical data to mount the attack. It's probably appropriate to
++# say that if the adversary reckons that this attack is beneficial and
++# worth the risk of being noticed, you probably have a larger problem in
++# giving him the mere opportunity. In other words suggested code design expects you
++# to preclude/mitigate this attack by overall system security design.
++# 2. Attacker manages to make his code interrupt driven. In order for
++# this kind of attack to be feasible, interrupt rate has to be high
++# enough, again comparable to duration of handful of AES rounds. But
++# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
++# generates interrupts at such raging rate...
++#
++# And now back to the former, hyper-threading CPU or more specifically
++# Intel P4. Recall that asynchronous attack implies that malicious
++# code instruments itself. And naturally instrumentation granularity
++# has to be noticeably lower than duration of codepath accessing S-box.
++# Given that all cache-lines are accessed during that time that is.
++# Current implementation accesses *all* cache-lines within ~50 cycles
++# window, which is actually *less* than RDTSC latency on Intel P4!
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
++push(@INC,"${dir}","${dir}../../perlasm"); # extra search paths for the require below
++require "x86asm.pl";
++
++$output = pop; # last command-line argument names the output file
++open OUT,">$output" or die "can't open $output: $!"; # fail loudly instead of emitting nothing
++*STDOUT=*OUT; # everything printed from here on lands in $output
++
++&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); # $x86only is set when the last remaining argument is "386"
++&static_label("AES_Te");
++&static_label("AES_Td");
++
++$s0="eax"; # $s0-$s3: the four 32-bit state words
++$s1="ebx";
++$s2="ecx";
++$s3="edx";
++$key="edi"; # key schedule pointer; round code also borrows it as a temporary
++$acc="esi"; # scratch/accumulator
++$tbl="ebp"; # lookup table base
++
++# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
++# by caller
++$__ra=&DWP(0,"esp"); # return address
++$__s0=&DWP(4,"esp"); # s0 backing store
++$__s1=&DWP(8,"esp"); # s1 backing store
++$__s2=&DWP(12,"esp"); # s2 backing store
++$__s3=&DWP(16,"esp"); # s3 backing store
++$__key=&DWP(20,"esp"); # pointer to key schedule
++$__end=&DWP(24,"esp"); # pointer to end of key schedule
++$__tbl=&DWP(28,"esp"); # %ebp backing store
++
++# stack frame layout in AES_[en|de]crypt routines, which differs from
++# above by 4 and overlaps by %ebp backing store
++$_tbl=&DWP(24,"esp"); # same slot as $__tbl once the return address is accounted for
++$_esp=&DWP(28,"esp"); # NOTE(review): presumably the caller's saved %esp - confirm against AES_encrypt
++
++sub _data_word() { while(defined(my $word=shift)) { &data_word($word,$word); } } # emit each argument twice in a row
++
++$speed_limit=512; # chunks smaller than $speed_limit are
++ # processed with compact routine in CBC mode
++$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
++ # recent µ-archs], but ~5 times smaller!
++ # I favor compact code to minimize cache
++ # contention and in hope to "collect" 5% back
++ # in real-life applications...
++
++$vertical_spin=0; # shift "vertically" defaults to 0, because of
++ # its proof-of-concept status...
++# Note that there is no decvert(), and that the last encryption round is
++# performed with "horizontal" shifts. This is because this "vertical"
++# implementation [one which groups shifts on a given $s[i] to form a
++# "column," unlike "horizontal" one, which groups shifts on different
++# $s[i] to form a "row"] is work in progress. It was observed to run
++# a few percent faster on Intel cores, but not AMD. On AMD K8 core it's
++# a whole 12% slower:-( So we face a trade-off... Shall it be resolved
++# some day? Till then the code is considered experimental and by
++# default remains dormant...
++
++sub encvert() # "vertical" table-lookup step: all bytes of one state word are looked up before moving to the next word
++{ my ($te,@s) = @_;
++ my ($v0,$v1) = ($acc,$key);
++ # $te is the table base register, @s the four state registers; $acc and $key serve as scratch
++ &mov ($v0,$s[3]); # copy s3
++ &mov (&DWP(4,"esp"),$s[2]); # save s2
++ &mov ($v1,$s[0]); # copy s0
++ &mov (&DWP(8,"esp"),$s[1]); # save s1
++ # the four bytes of s0 initialise the four output words
++ &movz ($s[2],&HB($s[0]));
++ &and ($s[0],0xFF);
++ &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
++ &shr ($v1,16);
++ &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
++ &movz ($s[1],&HB($v1));
++ &and ($v1,0xFF);
++ &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
++ &mov ($v1,$v0);
++ &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
++ # table entries for the bytes of s3 are xor-ed in
++ &and ($v0,0xFF);
++ &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
++ &movz ($v0,&HB($v1));
++ &shr ($v1,16);
++ &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
++ &movz ($v0,&HB($v1));
++ &and ($v1,0xFF);
++ &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
++ &mov ($v1,&DWP(4,"esp")); # restore s2
++ &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
++ # table entries for the bytes of s2 (just reloaded from stack) are xor-ed in
++ &mov ($v0,$v1);
++ &and ($v1,0xFF);
++ &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
++ &movz ($v1,&HB($v0));
++ &shr ($v0,16);
++ &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
++ &movz ($v1,&HB($v0));
++ &and ($v0,0xFF);
++ &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
++ &mov ($v0,&DWP(8,"esp")); # restore s1
++ &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
++ # table entries for the bytes of s1 (just reloaded from stack) are xor-ed in
++ &mov ($v1,$v0);
++ &and ($v0,0xFF);
++ &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
++ &movz ($v0,&HB($v1));
++ &shr ($v1,16);
++ &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
++ &movz ($v0,&HB($v1));
++ &and ($v1,0xFF);
++ &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
++ &mov ($key,$__key); # reincarnate v1 as key
++ &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
++}
++
++# Another experimental routine, which features "horizontal spin," but
++# eliminates one reference to stack. Strangely enough runs slower...
++sub enchoriz() # "horizontal" table-lookup step; takes no arguments and works on the global $s0-$s3 registers
++{ my ($v0,$v1,$te) = ($key,$acc,$tbl); # $te used to be undefined here: address the table through $tbl
++ # t[0] is built from bytes 0, 5, 10 and 15 and parked in the $__s0 stack slot
++ &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
++ &rotr ($s2,8); # 8,11,10, 9
++ &mov ($v1,&DWP(0,$te,$v0,8)); # 0
++ &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
++ &rotr ($s3,16); # 13,12,15,14
++ &xor ($v1,&DWP(3,$te,$v0,8)); # 5
++ &movz ($v0,&HB($s2)); # 8,11,10*, 9
++ &rotr ($s0,16); # 1, 0, 3, 2
++ &xor ($v1,&DWP(2,$te,$v0,8)); # 10
++ &movz ($v0,&HB($s3)); # 13,12,15*,14
++ &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
++ &mov ($__s0,$v1); # t[0] saved
++ # t[1] is built from bytes 4, 14, 3 and 9 and replaces s1
++ &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
++ &shr ($s1,16); # -, -, 7, 6
++ &mov ($v1,&DWP(0,$te,$v0,8)); # 4
++ &movz ($v0,&LB($s3)); # 13,12,15,14*
++ &xor ($v1,&DWP(2,$te,$v0,8)); # 14
++ &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
++ &and ($s3,0xffff0000); # 13,12, -, -
++ &xor ($v1,&DWP(1,$te,$v0,8)); # 3
++ &movz ($v0,&LB($s2)); # 8,11,10, 9*
++ &or ($s3,$s1); # 13,12, 7, 6
++ &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
++ &mov ($s1,$v1); # s[1]=t[1]
++ # t[2] is built from bytes 2, 7, 8 and 13 and stays in $v1 until the very end
++ &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
++ &shr ($s2,16); # -, -, 8,11
++ &mov ($v1,&DWP(2,$te,$v0,8)); # 2
++ &movz ($v0,&HB($s3)); # 13,12, 7*, 6
++ &xor ($v1,&DWP(1,$te,$v0,8)); # 7
++ &movz ($v0,&HB($s2)); # -, -, 8*,11
++ &xor ($v1,&DWP(0,$te,$v0,8)); # 8
++ &mov ($v0,$s3);
++ &shr ($v0,24); # 13
++ &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
++ # t[3] is built from bytes 11, 1, 6 and 12; s2 serves as its accumulator
++ &movz ($v0,&LB($s2)); # -, -, 8,11*
++ &shr ($s0,24); # 1*
++ &mov ($s2,&DWP(1,$te,$v0,8)); # 11
++ &xor ($s2,&DWP(3,$te,$s0,8)); # 1
++ &mov ($s0,$__s0); # s[0]=t[0]
++ &movz ($v0,&LB($s3)); # 13,12, 7, 6*
++ &shr ($s3,16); # , ,13,12
++ &xor ($s2,&DWP(2,$te,$v0,8)); # 6
++ &mov ($key,$__key); # reincarnate v0 as key
++ &and ($s3,0xff); # , ,13,12*
++ &mov ($s3,&DWP(0,$te,$s3,8)); # 12
++ &xor ($s3,$s2); # s[3]=t[3] collected
++ &mov ($s2,$v1); # s[2]=t[2]
++}
++
++# More experimental code... SSE one... Even though this one eliminates
++# *all* references to stack, it's not faster...
++sub sse_encbody() # experimental round body: state lives in mm0/mm4, lookups use the 8-byte-stride table at $tbl
++{
++ &movz ($acc,&LB("eax")); # 0
++ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
++ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
++ &movz ("edx",&HB("eax")); # 1
++ &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
++ &shr ("eax",16); # 5, 4
++
++ &movz ($acc,&LB("ebx")); # 10
++ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
++ &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
++ &movz ($acc,&HB("ebx")); # 11
++ &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
++ &shr ("ebx",16); # 15,14
++
++ &movz ($acc,&HB("eax")); # 5
++ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
++ &movq ("mm3",QWP(16,$key)); # 8 bytes of key material at $key+16, xor-ed into mm0 further down
++ &movz ($acc,&HB("ebx")); # 15
++ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
++ &movd ("mm0","ecx"); # t[0] collected
++
++ &movz ($acc,&LB("eax")); # 4
++ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
++ &movd ("eax","mm2"); # 7, 6, 3, 2
++ &movz ($acc,&LB("ebx")); # 14
++ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
++ &movd ("ebx","mm6"); # 13,12, 9, 8
++
++ &movz ($acc,&HB("eax")); # 3
++ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
++ &movz ($acc,&HB("ebx")); # 9
++ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
++ &movd ("mm1","ecx"); # t[1] collected
++
++ &movz ($acc,&LB("eax")); # 2
++ &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
++ &shr ("eax",16); # 7, 6
++ &punpckldq ("mm0","mm1"); # t[0,1] collected
++ &movz ($acc,&LB("ebx")); # 8
++ &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
++ &shr ("ebx",16); # 13,12
++
++ &movz ($acc,&HB("eax")); # 7
++ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
++ &pxor ("mm0","mm3"); # t[0,1] ^= key material loaded from $key+16
++ &movz ("eax",&LB("eax")); # 6
++ &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
++ &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
++ &movz ($acc,&HB("ebx")); # 13
++ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
++ &xor ("ecx",&DWP(24,$key)); # t[2]
++ &movd ("mm4","ecx"); # t[2] collected
++ &movz ("ebx",&LB("ebx")); # 12
++ &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
++ &shr ("ecx",16);
++ &movd ("eax","mm1"); # 5, 4, 1, 0
++ &mov ("ebx",&DWP(28,$key)); # t[3]
++ &xor ("ebx","edx");
++ &movd ("mm5","ebx"); # t[3] collected
++ &and ("ebx",0xffff0000);
++ &or ("ebx","ecx"); # ebx = upper half of t[3] | upper half of t[2], staged for the next invocation
++
++ &punpckldq ("mm4","mm5"); # t[2,3] collected
++}
++
++######################################################################
++# "Compact" block function
++######################################################################
++
++sub enccompact() # emit the byte-table lookups producing output word $i; args are ($i,$te,@s) plus an optional 7th flag
++{ my $Fn = \&mov;
++ while ($#_>5) { pop(@_); $Fn=sub{}; } # a 7th argument turns $Fn into a no-op
++ my ($i,$te,@s)=@_;
++ my $tmp = $key;
++ my $out = $i==3?$s[0]:$acc;
++
++ # $Fn is used in first compact round and its purpose is to
++ # avoid restoration of some values from stack, so that after
++ # 4xenccompact with extra argument $key value is left there...
++ if ($i==3) { &$Fn ($key,$__key); }##%edx
++ else { &mov ($out,$s[0]); }
++ &and ($out,0xFF);
++ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
++ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
++ &movz ($out,&BP(-128,$te,$out,1)); # byte 0: table is addressed as a byte array at $te-128
++ # byte 1 comes from bits 8..15 of $s[1]
++ if ($i==3) { $tmp=$s[1]; }##%eax
++ &movz ($tmp,&HB($s[1]));
++ &movz ($tmp,&BP(-128,$te,$tmp,1));
++ &shl ($tmp,8);
++ &xor ($out,$tmp);
++ # byte 2 comes from bits 16..23 of $s[2]
++ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
++ else { &mov ($tmp,$s[2]);
++ &shr ($tmp,16); }
++ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
++ &and ($tmp,0xFF);
++ &movz ($tmp,&BP(-128,$te,$tmp,1));
++ &shl ($tmp,16);
++ &xor ($out,$tmp);
++ # byte 3 comes from bits 24..31 of $s[3]
++ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
++ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
++ else { &mov ($tmp,$s[3]);
++ &shr ($tmp,24); }
++ &movz ($tmp,&BP(-128,$te,$tmp,1));
++ &shl ($tmp,24);
++ &xor ($out,$tmp);
++ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # words 0 and 1 are parked in the $__s0/$__s1 slots
++ if ($i==3) { &mov ($s[3],$acc); } # word 2 was left in $acc by the $i==2 call
++ &comment();
++}
++
++sub enctransform() # mix state word number $i in place; expects $tbl to hold 0x80808080 on entry (presumably AES MixColumns - confirm)
++{ my @s = ($s0,$s1,$s2,$s3);
++ my $i = shift;
++ my $tmp = $tbl;
++ my $r2 = $key ;
++
++ &and ($tmp,$s[$i]); # top bit of every byte
++ &lea ($r2,&DWP(0,$s[$i],$s[$i])); # word doubled
++ &mov ($acc,$tmp);
++ &shr ($tmp,7);
++ &and ($r2,0xfefefefe); # drop bits that crossed a byte boundary
++ &sub ($acc,$tmp); # 0x7f in every byte whose top bit was set
++ &mov ($tmp,$s[$i]);
++ &and ($acc,0x1b1b1b1b); # 0x1b in every byte whose top bit was set
++ &rotr ($tmp,16);
++ &xor ($acc,$r2); # r2
++ &mov ($r2,$s[$i]);
++ # combine r0, r2 and their rotations
++ &xor ($s[$i],$acc); # r0 ^ r2
++ &rotr ($r2,16+8);
++ &xor ($acc,$tmp);
++ &rotl ($s[$i],24);
++ &xor ($acc,$r2);
++ &mov ($tmp,0x80808080) if ($i!=1); # re-arm the mask for the next call; skipped for $i==1
++ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
++}
++
++&function_begin_B("_x86_AES_encrypt_compact"); # in: state in $s0-$s3, key schedule in $key, $tbl pointing 128 bytes into the byte table
++ # note that caller is expected to allocate stack frame for me!
++ &mov ($__key,$key); # save key
++
++ &xor ($s0,&DWP(0,$key)); # xor with key
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &mov ($acc,&DWP(240,$key)); # load key->rounds
++ &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # key+16*(rounds-1)
++ &mov ($__end,$acc); # end of key schedule
++
++ # prefetch Te4
++ &mov ($key,&DWP(0-128,$tbl));
++ &mov ($acc,&DWP(32-128,$tbl));
++ &mov ($key,&DWP(64-128,$tbl));
++ &mov ($acc,&DWP(96-128,$tbl));
++ &mov ($key,&DWP(128-128,$tbl));
++ &mov ($acc,&DWP(160-128,$tbl));
++ &mov ($key,&DWP(192-128,$tbl));
++ &mov ($acc,&DWP(224-128,$tbl));
++
++ &set_label("loop",16);
++ # one pass of the loop: byte lookups, word mixing, then xor with the next 16 key bytes
++ &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
++ &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
++ &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
++ &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
++ &mov ($tbl,0x80808080); # table pointer is sacrificed for the mask used by enctransform
++ &enctransform(2);
++ &enctransform(3);
++ &enctransform(0);
++ &enctransform(1);
++ &mov ($key,$__key);
++ &mov ($tbl,$__tbl); # table pointer back from its backing store
++ &add ($key,16); # advance rd_key
++ &xor ($s0,&DWP(0,$key));
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &cmp ($key,$__end);
++ &mov ($__key,$key);
++ &jb (&label("loop"));
++ # final pass: lookups only, no enctransform
++ &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
++ &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
++ &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
++ &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
++
++ &xor ($s0,&DWP(16,$key)); # xor with the 16 key bytes following the current ones
++ &xor ($s1,&DWP(20,$key));
++ &xor ($s2,&DWP(24,$key));
++ &xor ($s3,&DWP(28,$key));
++
++ &ret ();
++&function_end_B("_x86_AES_encrypt_compact");
++
++######################################################################
++# "Compact" SSE block function.
++######################################################################
++#
++# Performance is not actually extraordinary in comparison to pure
++# x86 code. In particular encrypt performance is virtually the same.
++# Decrypt performance on the other hand is 15-20% better on newer
++# µ-archs [but we're thankful for *any* improvement here], and ~50%
++# better on PIII:-) And additionally on the pros side this code
++# eliminates redundant references to stack and thus relieves/
++# minimizes the pressure on the memory bus.
++#
++# MMX register layout lsb
++# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
++# | mm4 | mm0 |
++# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
++# | s3 | s2 | s1 | s0 |
++# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
++# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
++# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
++#
++# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
++# In these terms encryption and decryption "compact" permutation
++# matrices can be depicted as follows:
++#
++# encryption lsb # decryption lsb
++# +----++----+----+----+----+ # +----++----+----+----+----+
++# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
++# +----++----+----+----+----+ # +----++----+----+----+----+
++# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
++# +----++----+----+----+----+ # +----++----+----+----+----+
++# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
++# +----++----+----+----+----+ # +----++----+----+----+----+
++# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
++# +----++----+----+----+----+ # +----++----+----+----+----+
++#
++######################################################################
++# Why not xmm registers? Short answer. It was actually tested and
++# was not any faster, but *contrary*, most notably on Intel CPUs.
++# Longer answer. Main advantage of using mm registers is that movd
++# latency is lower, especially on Intel P4. While arithmetic
++# instructions are twice as many, they can be scheduled every cycle
++# and not every second one when they are operating on xmm register,
++# so that "arithmetic throughput" remains virtually the same. And
++# finally the code can be executed even on elder SSE-only CPUs:-)
++
++sub sse_enccompact() # byte-table lookups for a whole block: state in mm0/mm4 on entry, t[0,1] in mm0 and t[2,3] in mm4 on exit
++{
++ &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
++ &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
++ &movd ("eax","mm1"); # 5, 4, 1, 0
++ &movd ("ebx","mm5"); # 15,14,11,10
++ &mov ($__key,$key); # $key doubles as an index register below, so park it on the stack
++
++ &movz ($acc,&LB("eax")); # 0
++ &movz ("edx",&HB("eax")); # 1
++ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
++ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
++ &movz ($key,&LB("ebx")); # 10
++ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
++ &shr ("eax",16); # 5, 4
++ &shl ("edx",8); # 1
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
++ &movz ($key,&HB("ebx")); # 11
++ &shl ($acc,16); # 10
++ &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
++ &or ("ecx",$acc); # 10
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
++ &movz ($key,&HB("eax")); # 5
++ &shl ($acc,24); # 11
++ &shr ("ebx",16); # 15,14
++ &or ("edx",$acc); # 11
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
++ &movz ($key,&HB("ebx")); # 15
++ &shl ($acc,8); # 5
++ &or ("ecx",$acc); # 5
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
++ &movz ($key,&LB("eax")); # 4
++ &shl ($acc,24); # 15
++ &or ("ecx",$acc); # 15
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
++ &movz ($key,&LB("ebx")); # 14
++ &movd ("eax","mm2"); # 7, 6, 3, 2
++ &movd ("mm0","ecx"); # t[0] collected
++ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
++ &movz ($key,&HB("eax")); # 3
++ &shl ("ecx",16); # 14
++ &movd ("ebx","mm6"); # 13,12, 9, 8
++ &or ("ecx",$acc); # 14
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
++ &movz ($key,&HB("ebx")); # 9
++ &shl ($acc,24); # 3
++ &or ("ecx",$acc); # 3
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
++ &movz ($key,&LB("ebx")); # 8
++ &shl ($acc,8); # 9
++ &shr ("ebx",16); # 13,12
++ &or ("ecx",$acc); # 9
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
++ &movz ($key,&LB("eax")); # 2
++ &shr ("eax",16); # 7, 6
++ &movd ("mm1","ecx"); # t[1] collected
++ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
++ &movz ($key,&HB("eax")); # 7
++ &shl ("ecx",16); # 2
++ &and ("eax",0xff); # 6
++ &or ("ecx",$acc); # 2
++
++ &punpckldq ("mm0","mm1"); # t[0,1] collected
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
++ &movz ($key,&HB("ebx")); # 13
++ &shl ($acc,24); # 7
++ &and ("ebx",0xff); # 12
++ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
++ &or ("ecx",$acc); # 7
++ &shl ("eax",16); # 6
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
++ &or ("edx","eax"); # 6
++ &shl ($acc,8); # 13
++ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
++ &or ("ecx",$acc); # 13
++ &or ("edx","ebx"); # 12
++ &mov ($key,$__key); # key schedule pointer back from the stack
++ &movd ("mm4","ecx"); # t[2] collected
++ &movd ("mm5","edx"); # t[3] collected
++
++ &punpckldq ("mm4","mm5"); # t[2,3] collected
++}
++
++ if (!$x86only) { # this routine is left out when the script was invoked with "386" as its last argument
++&function_begin_B("_sse_AES_encrypt_compact");
++ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
++ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
++
++ # note that caller is expected to allocate stack frame for me!
++ &mov ($acc,&DWP(240,$key)); # load key->rounds
++ &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # key+16*(rounds-1)
++ &mov ($__end,$acc); # end of key schedule
++
++ &mov ($s0,0x1b1b1b1b); # magic constant
++ &mov (&DWP(8,"esp"),$s0); # stored twice so that it can be fetched as one 64-bit operand
++ &mov (&DWP(12,"esp"),$s0);
++
++ # prefetch Te4
++ &mov ($s0,&DWP(0-128,$tbl));
++ &mov ($s1,&DWP(32-128,$tbl));
++ &mov ($s2,&DWP(64-128,$tbl));
++ &mov ($s3,&DWP(96-128,$tbl));
++ &mov ($s0,&DWP(128-128,$tbl));
++ &mov ($s1,&DWP(160-128,$tbl));
++ &mov ($s2,&DWP(192-128,$tbl));
++ &mov ($s3,&DWP(224-128,$tbl));
++
++ &set_label("loop",16);
++ &sse_enccompact();
++ &add ($key,16);
++ &cmp ($key,$__end);
++ &ja (&label("out")); # past the end marker: skip the mixing and finish
++ # mixing step, done on both 64-bit halves (mm0 and mm4) side by side
++ &movq ("mm2",&QWP(8,"esp")); # 0x1b1b1b1b1b1b1b1b
++ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
++ &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
++ &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
++ &pand ("mm3","mm2"); &pand ("mm7","mm2");
++ &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
++ &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
++ &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
++ &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
++
++ &movq ("mm2","mm3"); &movq ("mm6","mm7");
++ &pslld ("mm3",8); &pslld ("mm7",8);
++ &psrld ("mm2",24); &psrld ("mm6",24);
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
++ # the integer loads interleaved below touch the byte table at 64-byte intervals
++ &movq ("mm3","mm1"); &movq ("mm7","mm5");
++ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
++ &psrld ("mm1",8); &psrld ("mm5",8);
++ &mov ($s0,&DWP(0-128,$tbl));
++ &pslld ("mm3",24); &pslld ("mm7",24);
++ &mov ($s1,&DWP(64-128,$tbl));
++ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
++ &mov ($s2,&DWP(128-128,$tbl));
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
++ &mov ($s3,&DWP(192-128,$tbl));
++
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # xor with the 16 key bytes loaded above
++ &jmp (&label("loop"));
++
++ &set_label("out",16);
++ &pxor ("mm0",&QWP(0,$key)); # final xor with key material; result stays in mm0/mm4
++ &pxor ("mm4",&QWP(8,$key));
++
++ &ret ();
++&function_end_B("_sse_AES_encrypt_compact");
++ }
++
++######################################################################
++# Vanilla block function.
++######################################################################
++
++sub encstep() # emit one output word of a table-driven encrypt round; args: $i (step 0..3), $te (table base), @s (four state registers, rotated by the caller)
++{ my ($i,$te,@s) = @_;
++ my $tmp = $key; # the $key register doubles as scratch for steps 0..2
++ my $out = $i==3?$s[0]:$acc; # step 3 accumulates in place, steps 0..2 accumulate in $acc
++
++ # lines marked with #%e?x[i] denote "reordered" instructions...
++ if ($i==3) { &mov ($key,$__key); }##%edx
++ else { &mov ($out,$s[0]);
++ &and ($out,0xFF); }
++ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
++ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
++ &mov ($out,&DWP(0,$te,$out,8)); # table entries are 8 bytes apart; this read uses byte offset 0
++
++ if ($i==3) { $tmp=$s[1]; }##%eax
++ &movz ($tmp,&HB($s[1])); # second-lowest byte of $s[1]
++ &xor ($out,&DWP(3,$te,$tmp,8)); # same table, read at byte offset 3
++
++ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
++ else { &mov ($tmp,$s[2]);
++ &shr ($tmp,16); }
++ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
++ &and ($tmp,0xFF);
++ &xor ($out,&DWP(2,$te,$tmp,8)); # same table, read at byte offset 2
++
++ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
++ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
++ else { &mov ($tmp,$s[3]);
++ &shr ($tmp,24) }
++ &xor ($out,&DWP(1,$te,$tmp,8)); # same table, read at byte offset 1
++ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # park results 0 and 1 at 4(esp) and 8(esp)
++ if ($i==3) { &mov ($s[3],$acc); } # result 2 was left in $acc by the previous step
++ &comment();
++}
++
++sub enclast() # emit one output word of the closing encrypt round; same argument convention as encstep, but every table read is masked down to a single byte lane
++{ my ($i,$te,@s)=@_;
++ my $tmp = $key; # the $key register doubles as scratch for steps 0..2
++ my $out = $i==3?$s[0]:$acc; # step 3 accumulates in place, steps 0..2 accumulate in $acc
++
++ if ($i==3) { &mov ($key,$__key); }##%edx
++ else { &mov ($out,$s[0]); }
++ &and ($out,0xFF);
++ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
++ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
++ &mov ($out,&DWP(2,$te,$out,8)); # read at byte offset 2 of the 8-byte entry ...
++ &and ($out,0x000000ff); # ... and keep only bits 0..7
++
++ if ($i==3) { $tmp=$s[1]; }##%eax
++ &movz ($tmp,&HB($s[1]));
++ &mov ($tmp,&DWP(0,$te,$tmp,8)); # read at byte offset 0 ...
++ &and ($tmp,0x0000ff00); # ... and keep only bits 8..15
++ &xor ($out,$tmp);
++
++ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
++ else { &mov ($tmp,$s[2]);
++ &shr ($tmp,16); }
++ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
++ &and ($tmp,0xFF);
++ &mov ($tmp,&DWP(0,$te,$tmp,8)); # read at byte offset 0 ...
++ &and ($tmp,0x00ff0000); # ... and keep only bits 16..23
++ &xor ($out,$tmp);
++
++ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
++ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
++ else { &mov ($tmp,$s[3]);
++ &shr ($tmp,24); }
++ &mov ($tmp,&DWP(2,$te,$tmp,8)); # read at byte offset 2 ...
++ &and ($tmp,0xff000000); # ... and keep only bits 24..31
++ &xor ($out,$tmp);
++ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # park results 0 and 1 at 4(esp) and 8(esp)
++ if ($i==3) { &mov ($s[3],$acc); } # result 2 was left in $acc by the previous step
++}
++
++&function_begin_B("_x86_AES_encrypt"); # integer-register encrypt core: state in $s0..$s3, key schedule at $key, table base in $tbl
++ if ($vertical_spin) {
++ # I need high parts of volatile registers to be accessible...
++ &exch ($s1="edi",$key="ebx"); # also rebinds the Perl-level register names $s1/$key
++ &mov ($s2="esi",$acc="ecx"); # also rebinds the Perl-level register names $s2/$acc
++ }
++
++ # note that caller is expected to allocate stack frame for me!
++ &mov ($__key,$key); # save key
++
++ &xor ($s0,&DWP(0,$key)); # xor with key
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &mov ($acc,&DWP(240,$key)); # load key->rounds
++
++ if ($small_footprint) { # looped variant: one round body, iterated
++ &lea ($acc,&DWP(-2,$acc,$acc)); # acc = 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # acc = key + 16*(rounds-1)
++ &mov ($__end,$acc); # end of key schedule
++
++ &set_label("loop",16);
++ if ($vertical_spin) {
++ &encvert($tbl,$s0,$s1,$s2,$s3);
++ } else {
++ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
++ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
++ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
++ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
++ }
++ &add ($key,16); # advance rd_key
++ &xor ($s0,&DWP(0,$key));
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++ &cmp ($key,$__end);
++ &mov ($__key,$key); # mov leaves the flags from cmp intact for the jb below
++ &jb (&label("loop"));
++ }
++ else { # unrolled variant: entry point chosen by round count, falling through 14 -> 12 -> 10
++ &cmp ($acc,10);
++ &jle (&label("10rounds")); # rounds <= 10: skip both extra round pairs
++ &cmp ($acc,12);
++ &jle (&label("12rounds")); # rounds <= 12: skip the first extra round pair
++
++ &set_label("14rounds",4);
++ for ($i=1;$i<3;$i++) { # two rounds, round keys at offsets 16 and 32
++ if ($vertical_spin) {
++ &encvert($tbl,$s0,$s1,$s2,$s3);
++ } else {
++ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
++ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
++ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
++ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
++ }
++ &xor ($s0,&DWP(16*$i+0,$key));
++ &xor ($s1,&DWP(16*$i+4,$key));
++ &xor ($s2,&DWP(16*$i+8,$key));
++ &xor ($s3,&DWP(16*$i+12,$key));
++ }
++ &add ($key,32); # step over the two round keys just consumed
++ &mov ($__key,$key); # advance rd_key
++ &set_label("12rounds",4);
++ for ($i=1;$i<3;$i++) { # two rounds, round keys at offsets 16 and 32
++ if ($vertical_spin) {
++ &encvert($tbl,$s0,$s1,$s2,$s3);
++ } else {
++ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
++ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
++ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
++ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
++ }
++ &xor ($s0,&DWP(16*$i+0,$key));
++ &xor ($s1,&DWP(16*$i+4,$key));
++ &xor ($s2,&DWP(16*$i+8,$key));
++ &xor ($s3,&DWP(16*$i+12,$key));
++ }
++ &add ($key,32); # step over the two round keys just consumed
++ &mov ($__key,$key); # advance rd_key
++ &set_label("10rounds",4);
++ for ($i=1;$i<10;$i++) { # nine rounds, round keys at offsets 16..144; $key itself is not advanced
++ if ($vertical_spin) {
++ &encvert($tbl,$s0,$s1,$s2,$s3);
++ } else {
++ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
++ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
++ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
++ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
++ }
++ &xor ($s0,&DWP(16*$i+0,$key));
++ &xor ($s1,&DWP(16*$i+4,$key));
++ &xor ($s2,&DWP(16*$i+8,$key));
++ &xor ($s3,&DWP(16*$i+12,$key));
++ }
++ }
++
++ if ($vertical_spin) {
++ # "reincarnate" some registers for "horizontal" spin...
++ &mov ($s1="ebx",$key="edi");
++ &mov ($s2="ecx",$acc="esi");
++ }
++ &enclast(0,$tbl,$s0,$s1,$s2,$s3);
++ &enclast(1,$tbl,$s1,$s2,$s3,$s0);
++ &enclast(2,$tbl,$s2,$s3,$s0,$s1);
++ &enclast(3,$tbl,$s3,$s0,$s1,$s2);
++
++ &add ($key,$small_footprint?16:160); # point at the last round key: next slot for the loop, offset 160 for the unrolled path
++ &xor ($s0,&DWP(0,$key));
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &ret ();
++
++&set_label("AES_Te",64); # Yes! I keep it in the code segment!
++ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
++ &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
++ &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
++ &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
++ &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
++ &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
++ &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
++ &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
++ &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
++ &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
++ &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
++ &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
++ &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
++ &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
++ &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
++ &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
++ &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
++ &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
++ &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
++ &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
++ &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
++ &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
++ &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
++ &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
++ &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
++ &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
++ &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
++ &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
++ &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
++ &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
++ &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
++ &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
++ &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
++ &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
++ &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
++ &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
++ &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
++ &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
++ &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
++ &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
++ &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
++ &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
++ &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
++ &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
++ &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
++ &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
++ &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
++ &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
++ &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
++ &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
++ &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
++ &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
++ &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
++ &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
++ &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
++ &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
++ &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
++ &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
++ &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
++ &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
++ &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
++ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
++ &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
++ &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
++
++#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++#rcon: ten non-zero constants (0x01 .. 0x36) zero-padded to sixteen 32-bit words
++ &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
++ &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
++ &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
++ &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); # padding only
++&function_end_B("_x86_AES_encrypt"); # the tables above are emitted between this function's ret and its end marker
++
++# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
++&function_begin("AES_encrypt"); # public single-block entry: builds an aligned frame, locates the tables, then dispatches to the SSE or x86 compact core
++ &mov ($acc,&wparam(0)); # load inp
++ &mov ($key,&wparam(2)); # load key
++
++ &mov ($s0,"esp"); # remember the incoming esp; it is stored into $_esp below
++ &sub ("esp",36);
++ &and ("esp",-64); # align to cache-line
++
++ # place stack frame just "above" the key schedule
++ &lea ($s1,&DWP(-64-63,$key));
++ &sub ($s1,"esp");
++ &neg ($s1);
++ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
++ &sub ("esp",$s1);
++ &add ("esp",4); # 4 is reserved for caller's return address
++ &mov ($_esp,$s0); # save stack pointer
++
++ &call (&label("pic_point")); # make it PIC!
++ &set_label("pic_point");
++ &blindpop($tbl); # $tbl = run-time address of pic_point (the return address the call just pushed)
++ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only); # $s0 is dereferenced by the bt below
++ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); # $tbl = run-time address of AES_Te
++
++ # pick Te4 copy which can't "overlap" with stack frame or key schedule
++ &lea ($s1,&DWP(768-4,"esp"));
++ &sub ($s1,$tbl);
++ &and ($s1,0x300); # 0, 256, 512 or 768: selects one of the four Te4 copies
++ &lea ($tbl,&DWP(2048+128,$tbl,$s1)); # step 2048 bytes past AES_Te to the Te4 copies; +128 bias pairs with the -128 displacements used by the cores
++
++ if (!$x86only) {
++ &bt (&DWP(0,$s0),25); # check for SSE bit
++ &jnc (&label("x86")); # bit clear: take the integer-only path
++
++ &movq ("mm0",&QWP(0,$acc)); # load the 16-byte input block into mm0/mm4
++ &movq ("mm4",&QWP(8,$acc));
++ &call ("_sse_AES_encrypt_compact");
++ &mov ("esp",$_esp); # restore stack pointer
++ &mov ($acc,&wparam(1)); # load out
++ &movq (&QWP(0,$acc),"mm0"); # write output data
++ &movq (&QWP(8,$acc),"mm4");
++ &emms (); # clear MMX state before returning
++ &function_end_A(); # return from the SSE path here; the x86 path continues below
++ }
++ &set_label("x86",16);
++ &mov ($_tbl,$tbl); # keep the table pointer in the frame
++ &mov ($s0,&DWP(0,$acc)); # load input data
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++ &call ("_x86_AES_encrypt_compact"); # note: the compact core, not _x86_AES_encrypt
++ &mov ("esp",$_esp); # restore stack pointer
++ &mov ($acc,&wparam(1)); # load out
++ &mov (&DWP(0,$acc),$s0); # write output data
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++&function_end("AES_encrypt");
++
++#--------------------------------------------------------------------#
++
++######################################################################
++# "Compact" block function
++######################################################################
++
++sub deccompact() # emit one output word of a byte-table decrypt round; args: $i (step 0..3), $td (table base), @s (four state registers), plus an optional 7th flag argument
++{ my $Fn = \&mov; # default: the stack reloads in step 3 are really emitted
++ while ($#_>5) { pop(@_); $Fn=sub{}; } # any argument beyond the sixth turns $Fn into a no-op
++ my ($i,$td,@s)=@_;
++ my $tmp = $key; # the $key register doubles as scratch for steps 0..2
++ my $out = $i==3?$s[0]:$acc; # step 3 accumulates in place, steps 0..2 accumulate in $acc
++
++ # $Fn is used in first compact round and its purpose is to
++ # void restoration of some values from stack, so that after
++ # 4xdeccompact with extra argument $key, $s0 and $s1 values
++ # are left there...
++ if($i==3) { &$Fn ($key,$__key); }
++ else { &mov ($out,$s[0]); }
++ &and ($out,0xFF);
++ &movz ($out,&BP(-128,$td,$out,1)); # single-byte lookup; -128 pairs with the +128 bias carried in $td
++
++ if ($i==3) { $tmp=$s[1]; }
++ &movz ($tmp,&HB($s[1]));
++ &movz ($tmp,&BP(-128,$td,$tmp,1));
++ &shl ($tmp,8); # place the looked-up byte in bits 8..15
++ &xor ($out,$tmp);
++
++ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } # result 2 was left in $acc by the previous step
++ else { mov ($tmp,$s[2]); }
++ &shr ($tmp,16);
++ &and ($tmp,0xFF);
++ &movz ($tmp,&BP(-128,$td,$tmp,1));
++ &shl ($tmp,16); # place the looked-up byte in bits 16..23
++ &xor ($out,$tmp);
++
++ if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
++ else { &mov ($tmp,$s[3]); }
++ &shr ($tmp,24);
++ &movz ($tmp,&BP(-128,$td,$tmp,1));
++ &shl ($tmp,24); # place the looked-up byte in bits 24..31
++ &xor ($out,$tmp);
++ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # park results 0 and 1 at 4(esp) and 8(esp)
++ if ($i==3) { &$Fn ($s[3],$__s0); }
++}
++
++# must be called with 2,3,0,1 as argument sequence!!!
++sub dectransform() # emit the per-word mixing for state word $i (the single argument), built from three successive byte-wise doublings tp2, tp4, tp8
++{ my @s = ($s0,$s1,$s2,$s3);
++ my $i = shift;
++ my $tmp = $key; # $key register is scratch here
++ my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1); # scratch borrowed from another state register
++ my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1); # scratch borrowed from another state register
++ my $tp8 = $tbl; # $tbl is clobbered as scratch; _x86_AES_decrypt_compact reloads it from $__tbl afterwards
++
++ &mov ($tmp,0x80808080);
++ &and ($tmp,$s[$i]); # isolate the top bit of every byte
++ &mov ($acc,$tmp);
++ &shr ($tmp,7); # 0x01 in each byte whose top bit was set
++ &lea ($tp2,&DWP(0,$s[$i],$s[$i])); # tp2 = 2*tp1 as a 32-bit add
++ &sub ($acc,$tmp); # 0x80-0x01 = 0x7f in each flagged byte
++ &and ($tp2,0xfefefefe); # drop bits carried in from the neighbouring byte
++ &and ($acc,0x1b1b1b1b); # 0x7f&0x1b = 0x1b in each flagged byte
++ &xor ($tp2,$acc); # tp2 = byte-wise doubling with 0x1b reduction
++ &mov ($tmp,0x80808080);
++
++ &and ($tmp,$tp2); # same doubling again, tp2 -> tp4
++ &mov ($acc,$tmp);
++ &shr ($tmp,7);
++ &lea ($tp4,&DWP(0,$tp2,$tp2));
++ &sub ($acc,$tmp);
++ &and ($tp4,0xfefefefe);
++ &and ($acc,0x1b1b1b1b);
++ &xor ($tp2,$s[$i]); # tp2^tp1
++ &xor ($tp4,$acc);
++ &mov ($tmp,0x80808080);
++
++ &and ($tmp,$tp4); # same doubling again, tp4 -> tp8
++ &mov ($acc,$tmp);
++ &shr ($tmp,7);
++ &lea ($tp8,&DWP(0,$tp4,$tp4));
++ &sub ($acc,$tmp);
++ &and ($tp8,0xfefefefe);
++ &and ($acc,0x1b1b1b1b);
++ &xor ($tp4,$s[$i]); # tp4^tp1
++ &rotl ($s[$i],8); # = ROTATE(tp1,8)
++ &xor ($tp8,$acc);
++
++ &xor ($s[$i],$tp2);
++ &xor ($tp2,$tp8);
++ &xor ($s[$i],$tp4);
++ &xor ($tp4,$tp8);
++ &rotl ($tp2,24);
++ &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
++ &rotl ($tp4,16);
++ &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
++ &rotl ($tp8,8);
++ &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
++ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
++ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
++ &mov ($s[2],$__s2) if($i==1); # reload $s2 from its stack slot
++ &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
++
++ &mov ($s[3],$__s3) if($i==1); # reload $s3 from its stack slot
++ &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); # words 2 and 3 are parked at 12(esp) and 16(esp)
++}
++
++&function_begin_B("_x86_AES_decrypt_compact"); # integer-register decrypt core using byte-table lookups: state in $s0..$s3, key schedule at $key, table at $tbl
++ # note that caller is expected to allocate stack frame for me!
++ &mov ($__key,$key); # save key
++
++ &xor ($s0,&DWP(0,$key)); # xor with key
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &mov ($acc,&DWP(240,$key)); # load key->rounds
++
++ &lea ($acc,&DWP(-2,$acc,$acc)); # acc = 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # acc = key + 16*(rounds-1)
++ &mov ($__end,$acc); # end of key schedule
++
++ # prefetch Td4
++ &mov ($key,&DWP(0-128,$tbl)); # loaded values are dead: $key is reloaded from $__key before its next use as a pointer
++ &mov ($acc,&DWP(32-128,$tbl));
++ &mov ($key,&DWP(64-128,$tbl));
++ &mov ($acc,&DWP(96-128,$tbl));
++ &mov ($key,&DWP(128-128,$tbl));
++ &mov ($acc,&DWP(160-128,$tbl));
++ &mov ($key,&DWP(192-128,$tbl));
++ &mov ($acc,&DWP(224-128,$tbl));
++
++ &set_label("loop",16);
++
++ &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1); # trailing 1 = extra argument: deccompact then omits its stack reloads
++ &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
++ &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
++ &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
++ &dectransform(2); # call order 2,3,0,1 is required by dectransform
++ &dectransform(3);
++ &dectransform(0);
++ &dectransform(1);
++ &mov ($key,$__key); # $key was used as scratch above
++ &mov ($tbl,$__tbl); # dectransform used $tbl as scratch (tp8)
++ &add ($key,16); # advance rd_key
++ &xor ($s0,&DWP(0,$key));
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &cmp ($key,$__end);
++ &mov ($__key,$key); # mov leaves the flags from cmp intact for the jb below
++ &jb (&label("loop"));
++
++ &deccompact(0,$tbl,$s0,$s3,$s2,$s1); # closing round: no extra argument, so the reloads are emitted; no dectransform follows
++ &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
++ &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
++ &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
++
++ &xor ($s0,&DWP(16,$key)); # $key still holds the value saved in $__key, so the last round key sits at +16
++ &xor ($s1,&DWP(20,$key));
++ &xor ($s2,&DWP(24,$key));
++ &xor ($s3,&DWP(28,$key));
++
++ &ret ();
++&function_end_B("_x86_AES_decrypt_compact");
++
++######################################################################
++# "Compact" SSE block function.
++######################################################################
++
++sub sse_deccompact() # emit the byte-table substitution step: bytes of mm0/mm4 are moved through eax/ebx, looked up at -128($tbl), and regrouped into t[0..3] in mm0/mm4; trailing numbers below are byte indices
++{
++ &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
++ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
++ &movd ("eax","mm1"); # 7, 6, 1, 0
++ &movd ("ebx","mm5"); # 13,12,11,10
++ &mov ($__key,$key); # $key is used as an index scratch below and restored at the end
++
++ &movz ($acc,&LB("eax")); # 0
++ &movz ("edx",&HB("eax")); # 1
++ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
++ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
++ &movz ($key,&LB("ebx")); # 10
++ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
++ &shr ("eax",16); # 7, 6
++ &shl ("edx",8); # 1
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
++ &movz ($key,&HB("ebx")); # 11
++ &shl ($acc,16); # 10
++ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
++ &or ("ecx",$acc); # 10
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
++ &movz ($key,&HB("eax")); # 7
++ &shl ($acc,24); # 11
++ &shr ("ebx",16); # 13,12
++ &or ("edx",$acc); # 11
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
++ &movz ($key,&HB("ebx")); # 13
++ &shl ($acc,24); # 7
++ &or ("ecx",$acc); # 7
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
++ &movz ($key,&LB("eax")); # 6
++ &shl ($acc,8); # 13
++ &movd ("eax","mm2"); # 3, 2, 5, 4
++ &or ("ecx",$acc); # 13
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
++ &movz ($key,&LB("ebx")); # 12
++ &shl ($acc,16); # 6
++ &movd ("ebx","mm6"); # 9, 8,15,14
++ &movd ("mm0","ecx"); # t[0] collected
++ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
++ &movz ($key,&LB("eax")); # 4
++ &or ("ecx",$acc); # 12
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
++ &movz ($key,&LB("ebx")); # 14
++ &or ("edx",$acc); # 4
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
++ &movz ($key,&HB("eax")); # 5
++ &shl ($acc,16); # 14
++ &shr ("eax",16); # 3, 2
++ &or ("edx",$acc); # 14
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
++ &movz ($key,&HB("ebx")); # 15
++ &shr ("ebx",16); # 9, 8
++ &shl ($acc,8); # 5
++ &movd ("mm1","edx"); # t[1] collected
++ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
++ &movz ($key,&HB("ebx")); # 9
++ &shl ("edx",24); # 15
++ &and ("ebx",0xff); # 8
++ &or ("edx",$acc); # 15
++
++ &punpckldq ("mm0","mm1"); # t[0,1] collected
++
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
++ &movz ($key,&LB("eax")); # 2
++ &shl ($acc,8); # 9
++ &movz ("eax",&HB("eax")); # 3
++ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
++ &or ("ecx",$acc); # 9
++ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
++ &or ("edx","ebx"); # 8
++ &shl ($acc,16); # 2
++ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
++ &or ("edx",$acc); # 2
++ &shl ("eax",24); # 3
++ &or ("ecx","eax"); # 3
++ &mov ($key,$__key); # restore the key pointer saved at the top
++ &movd ("mm4","edx"); # t[2] collected
++ &movd ("mm5","ecx"); # t[3] collected
++
++ &punpckldq ("mm4","mm5"); # t[2,3] collected
++}
++
++ if (!$x86only) { # the SSE core is generated only when $x86only is false
++&function_begin_B("_sse_AES_decrypt_compact"); # MMX-register decrypt core: block arrives in mm0/mm4, key schedule at $key, result left in mm0/mm4
++ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
++ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
++
++ # note that caller is expected to allocate stack frame for me!
++ &mov ($acc,&DWP(240,$key)); # load key->rounds
++ &lea ($acc,&DWP(-2,$acc,$acc)); # acc = 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # acc = key + 16*(rounds-1)
++ &mov ($__end,$acc); # end of key schedule
++
++ &mov ($s0,0x1b1b1b1b); # magic constant
++ &mov (&DWP(8,"esp"),$s0); # two copies side by side ...
++ &mov (&DWP(12,"esp"),$s0); # ... so the loop can read them back as one 64-bit mask via QWP(8,esp)
++
++ # prefetch Td4
++ &mov ($s0,&DWP(0-128,$tbl)); # one load every 32 bytes across a 256-byte window; the values are overwritten unused
++ &mov ($s1,&DWP(32-128,$tbl));
++ &mov ($s2,&DWP(64-128,$tbl));
++ &mov ($s3,&DWP(96-128,$tbl));
++ &mov ($s0,&DWP(128-128,$tbl));
++ &mov ($s1,&DWP(160-128,$tbl));
++ &mov ($s2,&DWP(192-128,$tbl));
++ &mov ($s3,&DWP(224-128,$tbl));
++
++ &set_label("loop",16);
++ &sse_deccompact(); # byte substitution; results regrouped into mm0/mm4
++ &add ($key,16); # step to the next round key
++ &cmp ($key,$__end);
++ &ja (&label("out")); # $key moved past $__end: skip the mixing below and finish
++
++ # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
++ &movq ("mm3","mm0"); &movq ("mm7","mm4");
++ &movq ("mm2","mm0",1); &movq ("mm6","mm4",1); # NOTE(review): the extra third argument is interpreted by the perlasm movq wrapper, which is defined outside this chunk
++ &movq ("mm1","mm0"); &movq ("mm5","mm4");
++ &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
++ &pslld ("mm2",8); &pslld ("mm6",8);
++ &psrld ("mm3",8); &psrld ("mm7",8);
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
++ &pslld ("mm2",16); &pslld ("mm6",16);
++ &psrld ("mm3",16); &psrld ("mm7",16);
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
++
++ &movq ("mm3",&QWP(8,"esp")); # reload the 0x1b mask stored above
++ &pxor ("mm2","mm2"); &pxor ("mm6","mm6"); # zero the compare operands
++ &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5"); # 0xff in every byte lane whose top bit is set
++ &pand ("mm2","mm3"); &pand ("mm6","mm3"); # keep 0x1b in exactly those lanes
++ &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); # double every byte lane
++ &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
++ &movq ("mm3","mm1"); &movq ("mm7","mm5");
++ &movq ("mm2","mm1"); &movq ("mm6","mm5");
++ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
++ &pslld ("mm3",24); &pslld ("mm7",24);
++ &psrld ("mm2",8); &psrld ("mm6",8);
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
++
++ &movq ("mm2",&QWP(8,"esp")); # 0x1b mask again, this time kept in mm2 for the next two doublings
++ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
++ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
++ &pand ("mm3","mm2"); &pand ("mm7","mm2");
++ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
++ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
++ &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
++ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
++
++ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
++ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
++ &pand ("mm3","mm2"); &pand ("mm7","mm2");
++ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
++ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
++ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
++ &movq ("mm3","mm1"); &movq ("mm7","mm5");
++ &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
++ &pslld ("mm1",8); &pslld ("mm5",8);
++ &psrld ("mm3",8); &psrld ("mm7",8);
++ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); # fetch the round key $key now points at
++ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
++ &mov ($s0,&DWP(0-128,$tbl)); # integer table loads interleaved with the MMX work (same window as the prefetch above)
++ &pslld ("mm1",16); &pslld ("mm5",16);
++ &mov ($s1,&DWP(64-128,$tbl));
++ &psrld ("mm3",16); &psrld ("mm7",16);
++ &mov ($s2,&DWP(128-128,$tbl));
++ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
++ &mov ($s3,&DWP(192-128,$tbl));
++ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
++
++ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # xor in the round key fetched above
++ &jmp (&label("loop"));
++
++ &set_label("out",16);
++ &pxor ("mm0",&QWP(0,$key)); # xor with the last round key
++ &pxor ("mm4",&QWP(8,$key));
++
++ &ret ();
++&function_end_B("_sse_AES_decrypt_compact");
++ }
++
++######################################################################
++# Vanilla block function.
++######################################################################
++
++sub decstep() # emit one output word of a table-driven decrypt round; args: $i (step 0..3), $td (table base), @s (four state registers, ordered by the caller)
++{ my ($i,$td,@s) = @_;
++ my $tmp = $key; # the $key register doubles as scratch for steps 0..2
++ my $out = $i==3?$s[0]:$acc; # step 3 accumulates in place, steps 0..2 accumulate in $acc
++
++ # no instructions are reordered, as performance appears
++ # optimal... or rather that all attempts to reorder didn't
++ # result in better performance [which by the way is not a
++ # bit lower than encryption].
++ if($i==3) { &mov ($key,$__key); }
++ else { &mov ($out,$s[0]); }
++ &and ($out,0xFF);
++ &mov ($out,&DWP(0,$td,$out,8)); # table entries are 8 bytes apart; this read uses byte offset 0
++
++ if ($i==3) { $tmp=$s[1]; }
++ &movz ($tmp,&HB($s[1]));
++ &xor ($out,&DWP(3,$td,$tmp,8)); # same table, read at byte offset 3
++
++ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } # result 2 was left in $acc by the previous step
++ else { &mov ($tmp,$s[2]); }
++ &shr ($tmp,16);
++ &and ($tmp,0xFF);
++ &xor ($out,&DWP(2,$td,$tmp,8)); # same table, read at byte offset 2
++
++ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
++ else { &mov ($tmp,$s[3]); }
++ &shr ($tmp,24);
++ &xor ($out,&DWP(1,$td,$tmp,8)); # same table, read at byte offset 1
++ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # park results 0 and 1 at 4(esp) and 8(esp)
++ if ($i==3) { &mov ($s[3],$__s0); }
++ &comment();
++}
++
++sub declast() # emit code for one output word of the final decryption round (byte-table lookups)
++{ my ($i,$td,@s)=@_; # $i: step 0..3; $td: table base register; @s: the four state registers in lookup order
++ my $tmp = $key; # steps 0..2 use $key as scratch; step 3 switches $tmp to $s[1..3] below
++ my $out = $i==3?$s[0]:$acc; # steps 0..2 build the word in $acc, step 3 in place in $s[0]
++
++ if($i==0) { &lea ($td,&DWP(2048+128,$td)); # step 0: move $td 2048 bytes on to the byte table, plus a 128 bias for the loads below
++ &mov ($tmp,&DWP(0-128,$td)); # eight loads at 32-byte stride touch all 256 bytes of the table;
++ &mov ($acc,&DWP(32-128,$td)); # $tmp and $acc are overwritten before being read, so only the
++ &mov ($tmp,&DWP(64-128,$td)); # memory access matters (presumably a cache prefetch)
++ &mov ($acc,&DWP(96-128,$td));
++ &mov ($tmp,&DWP(128-128,$td));
++ &mov ($acc,&DWP(160-128,$td));
++ &mov ($tmp,&DWP(192-128,$td));
++ &mov ($acc,&DWP(224-128,$td));
++ &lea ($td,&DWP(-128,$td)); } # drop the bias; $td is now the original base + 2048
++ if($i==3) { &mov ($key,$__key); } # step 3: reload $key from its saved copy
++ else { &mov ($out,$s[0]); } # steps 0..2: work on a copy so $s[0] stays intact
++ &and ($out,0xFF); # index = bits 0..7 of $s[0]
++ &movz ($out,&BP(0,$td,$out,1)); # zero-extended table byte becomes bits 0..7 of the result
++
++ if ($i==3) { $tmp=$s[1]; } # step 3 may clobber $s[1]; it is reloaded below
++ &movz ($tmp,&HB($s[1])); # index via &HB($s[1]) - presumably bits 8..15; HB is defined outside this chunk
++ &movz ($tmp,&BP(0,$td,$tmp,1));
++ &shl ($tmp,8); # table byte goes to bits 8..15
++ &xor ($out,$tmp);
++
++ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); } # step 3: $s[1] receives $acc (the step-2 word when called in 0,1,2,3 order)
++ else { &mov ($tmp,$s[2]); } # '&' sigil restored for consistency with every other emitter call here
++ &shr ($tmp,16);
++ &and ($tmp,0xFF); # index = bits 16..23 of $s[2]
++ &movz ($tmp,&BP(0,$td,$tmp,1));
++ &shl ($tmp,16); # table byte goes to bits 16..23
++ &xor ($out,$tmp);
++
++ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); } # step 3: $s[2] reloaded from $__s1 (defined outside this chunk; presumably the step-1 spill slot)
++ else { &mov ($tmp,$s[3]); }
++ &shr ($tmp,24); # index = bits 24..31 of $s[3]
++ &movz ($tmp,&BP(0,$td,$tmp,1));
++ &shl ($tmp,24); # table byte goes to bits 24..31
++ &xor ($out,$tmp);
++ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # steps 0 and 1 spill their word to 4(%esp) / 8(%esp)
++ if ($i==3) { &mov ($s[3],$__s0); # step 3: $s[3] reloaded from $__s0 (defined outside this chunk; presumably the step-0 spill slot)
++ &lea ($td,&DWP(-2048,$td)); } # undo the +2048 applied in step 0
++}
++
++&function_begin_B("_x86_AES_decrypt"); # table-driven block decryption: state in $s0..$s3, key schedule pointer in $key, table base in $tbl
++ # note that caller is expected to allocate stack frame for me!
++ &mov ($__key,$key); # save key (decstep/declast use $key as scratch and reload it from here)
++
++ &xor ($s0,&DWP(0,$key)); # xor with key
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &mov ($acc,&DWP(240,$key)); # load key->rounds
++
++ if ($small_footprint) { # looped variant: a single round body iterated over the key schedule
++ &lea ($acc,&DWP(-2,$acc,$acc)); # $acc = 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # $acc = $key + 16*(rounds-1)
++ &mov ($__end,$acc); # end of key schedule
++ &set_label("loop",16);
++ &decstep(0,$tbl,$s0,$s3,$s2,$s1); # the register list is rotated by one position per step
++ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
++ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
++ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
++ &add ($key,16); # advance rd_key
++ &xor ($s0,&DWP(0,$key));
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++ &cmp ($key,$__end);
++ &mov ($__key,$key); # keep the saved copy current; decstep(3,...) reloads $key from it
++ &jb (&label("loop")); # mov leaves the flags alone, so this still tests the cmp above
++ }
++ else { # unrolled variant: enter at the label matching key->rounds and fall through
++ &cmp ($acc,10);
++ &jle (&label("10rounds"));
++ &cmp ($acc,12);
++ &jle (&label("12rounds"));
++
++ &set_label("14rounds",4); # two rounds, round keys at $key+16 and $key+32
++ for ($i=1;$i<3;$i++) {
++ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
++ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
++ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
++ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
++ &xor ($s0,&DWP(16*$i+0,$key));
++ &xor ($s1,&DWP(16*$i+4,$key));
++ &xor ($s2,&DWP(16*$i+8,$key));
++ &xor ($s3,&DWP(16*$i+12,$key));
++ }
++ &add ($key,32); # step over the two round keys just consumed
++ &mov ($__key,$key); # advance rd_key
++ &set_label("12rounds",4); # two rounds, round keys at $key+16 and $key+32
++ for ($i=1;$i<3;$i++) {
++ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
++ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
++ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
++ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
++ &xor ($s0,&DWP(16*$i+0,$key));
++ &xor ($s1,&DWP(16*$i+4,$key));
++ &xor ($s2,&DWP(16*$i+8,$key));
++ &xor ($s3,&DWP(16*$i+12,$key));
++ }
++ &add ($key,32); # step over the two round keys just consumed
++ &mov ($__key,$key); # advance rd_key
++ &set_label("10rounds",4); # nine rounds, round keys at $key+16 .. $key+144
++ for ($i=1;$i<10;$i++) {
++ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
++ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
++ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
++ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
++ &xor ($s0,&DWP(16*$i+0,$key));
++ &xor ($s1,&DWP(16*$i+4,$key));
++ &xor ($s2,&DWP(16*$i+8,$key));
++ &xor ($s3,&DWP(16*$i+12,$key));
++ }
++ }
++
++ &declast(0,$tbl,$s0,$s3,$s2,$s1); # final round: byte-table lookups only
++ &declast(1,$tbl,$s1,$s0,$s3,$s2);
++ &declast(2,$tbl,$s2,$s1,$s0,$s3);
++ &declast(3,$tbl,$s3,$s2,$s1,$s0);
++
++ &add ($key,$small_footprint?16:160); # last round key: 16 past the loop's $key, or 160 past the unrolled base
++ &xor ($s0,&DWP(0,$key));
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &ret ();
++
++&set_label("AES_Td",64); # Yes! I keep it in the code segment!
++ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
++ &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
++ &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
++ &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
++ &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
++ &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
++ &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
++ &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
++ &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
++ &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
++ &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
++ &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
++ &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
++ &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
++ &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
++ &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
++ &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
++ &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
++ &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
++ &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
++ &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
++ &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
++ &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
++ &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
++ &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
++ &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
++ &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
++ &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
++ &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
++ &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
++ &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
++ &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
++ &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
++ &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
++ &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
++ &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
++ &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
++ &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
++ &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
++ &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
++ &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
++ &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
++ &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
++ &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
++ &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
++ &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
++ &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
++ &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
++ &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
++ &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
++ &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
++ &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
++ &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
++ &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
++ &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
++ &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
++ &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
++ &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
++ &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
++ &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
++ &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
++ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
++ &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
++ &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
++
++#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++&function_end_B("_x86_AES_decrypt");
++
++# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
++&function_begin("AES_decrypt"); # public single-block entry point: builds an aligned frame, picks a table copy, dispatches to a compact routine
++ &mov ($acc,&wparam(0)); # load inp
++ &mov ($key,&wparam(2)); # load key
++
++ &mov ($s0,"esp"); # remember the incoming stack pointer (stored to $_esp below)
++ &sub ("esp",36);
++ &and ("esp",-64); # align to cache-line
++
++ # place stack frame just "above" the key schedule
++ &lea ($s1,&DWP(-64-63,$key));
++ &sub ($s1,"esp");
++ &neg ($s1); # $s1 = esp - (key-127)
++ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
++ &sub ("esp",$s1);
++ &add ("esp",4); # 4 is reserved for caller's return address
++ &mov ($_esp,$s0); # save stack pointer
++
++ &call (&label("pic_point")); # make it PIC!
++ &set_label("pic_point");
++ &blindpop($tbl); # presumably pops the address pushed by the call, i.e. that of pic_point (blindpop is defined outside this chunk)
++ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); # $s0 is dereferenced by the bt below to read the capability word
++ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); # $tbl += AES_Td - pic_point
++
++ # pick Td4 copy which can't "overlap" with stack frame or key schedule
++ &lea ($s1,&DWP(768-4,"esp"));
++ &sub ($s1,$tbl);
++ &and ($s1,0x300); # 0, 256, 512 or 768: selects one of the four 256-byte copies
++ &lea ($tbl,&DWP(2048+128,$tbl,$s1)); # copies start 2048 bytes past AES_Td; the +128 bias pairs with the N-128 displacements used on $tbl
++
++ if (!$x86only) {
++ &bt (&DWP(0,$s0),25); # check for SSE bit
++ &jnc (&label("x86")); # bit clear: take the integer-only path
++
++ &movq ("mm0",&QWP(0,$acc)); # load the 16 input bytes into mm0/mm4
++ &movq ("mm4",&QWP(8,$acc));
++ &call ("_sse_AES_decrypt_compact");
++ &mov ("esp",$_esp); # restore stack pointer
++ &mov ($acc,&wparam(1)); # load out
++ &movq (&QWP(0,$acc),"mm0"); # write output data
++ &movq (&QWP(8,$acc),"mm4");
++ &emms ();
++ &function_end_A(); # presumably emits the epilogue and return for this path (defined outside this chunk)
++ }
++ &set_label("x86",16);
++ &mov ($_tbl,$tbl); # store the table pointer in $_tbl (defined outside this chunk)
++ &mov ($s0,&DWP(0,$acc)); # load input data
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++ &call ("_x86_AES_decrypt_compact"); # note: the compact routine, not _x86_AES_decrypt
++ &mov ("esp",$_esp); # restore stack pointer
++ &mov ($acc,&wparam(1)); # load out
++ &mov (&DWP(0,$acc),$s0); # write output data
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++&function_end("AES_decrypt");
++
++# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
++# size_t length, const AES_KEY *key,
++# unsigned char *ivp,const int enc);
++{
++# stack frame layout
++# -4(%esp) # return address 0(%esp)
++# 0(%esp) # s0 backing store 4(%esp)
++# 4(%esp) # s1 backing store 8(%esp)
++# 8(%esp) # s2 backing store 12(%esp)
++# 12(%esp) # s3 backing store 16(%esp)
++# 16(%esp) # key backup 20(%esp)
++# 20(%esp) # end of key schedule 24(%esp)
++# 24(%esp) # %ebp backup 28(%esp)
++# 28(%esp) # %esp backup
++my $_inp=&DWP(32,"esp"); # copy of wparam(0)
++my $_out=&DWP(36,"esp"); # copy of wparam(1)
++my $_len=&DWP(40,"esp"); # copy of wparam(2)
++my $_key=&DWP(44,"esp"); # copy of wparam(3)
++my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
++my $_tmp=&DWP(52,"esp"); # volatile variable
++#
++my $ivec=&DWP(60,"esp"); # ivec[16]
++my $aes_key=&DWP(76,"esp"); # copy of aes_key
++my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
++
++&function_begin("AES_cbc_encrypt");
++ &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
++ &cmp ($s2,0);
++ &je (&label("drop_out"));
++
++ &call (&label("pic_point")); # make it PIC!
++ &set_label("pic_point");
++ &blindpop($tbl);
++ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
++
++ &cmp (&wparam(5),0);
++ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
++ &jne (&label("picked_te"));
++ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
++ &set_label("picked_te");
++
++ # one can argue if this is required
++ &pushf ();
++ &cld ();
++
++ &cmp ($s2,$speed_limit);
++ &jb (&label("slow_way"));
++ &test ($s2,15);
++ &jnz (&label("slow_way"));
++ if (!$x86only) {
++ &bt (&DWP(0,$s0),28); # check for hyper-threading bit
++ &jc (&label("slow_way"));
++ }
++ # pre-allocate aligned stack frame...
++ &lea ($acc,&DWP(-80-244,"esp"));
++ &and ($acc,-64);
++
++ # ... and make sure it doesn't alias with $tbl modulo 4096
++ &mov ($s0,$tbl);
++ &lea ($s1,&DWP(2048+256,$tbl));
++ &mov ($s3,$acc);
++ &and ($s0,0xfff); # s = %ebp&0xfff
++ &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
++ &and ($s3,0xfff); # p = %esp&0xfff
++
++ &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
++ &jb (&label("tbl_break_out"));
++ &sub ($s3,$s1);
++ &sub ($acc,$s3);
++ &jmp (&label("tbl_ok"));
++ &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
++ &sub ($s3,$s0);
++ &and ($s3,0xfff);
++ &add ($s3,384);
++ &sub ($acc,$s3);
++ &set_label("tbl_ok",4);
++
++ &lea ($s3,&wparam(0)); # obtain pointer to parameter block
++ &exch ("esp",$acc); # allocate stack frame
++ &add ("esp",4); # reserve for return address!
++ &mov ($_tbl,$tbl); # save %ebp
++ &mov ($_esp,$acc); # save %esp
++
++ &mov ($s0,&DWP(0,$s3)); # load inp
++ &mov ($s1,&DWP(4,$s3)); # load out
++ #&mov ($s2,&DWP(8,$s3)); # load len
++ &mov ($key,&DWP(12,$s3)); # load key
++ &mov ($acc,&DWP(16,$s3)); # load ivp
++ &mov ($s3,&DWP(20,$s3)); # load enc flag
++
++ &mov ($_inp,$s0); # save copy of inp
++ &mov ($_out,$s1); # save copy of out
++ &mov ($_len,$s2); # save copy of len
++ &mov ($_key,$key); # save copy of key
++ &mov ($_ivp,$acc); # save copy of ivp
++
++ &mov ($mark,0); # copy of aes_key->rounds = 0;
++ # do we copy key schedule to stack?
++ &mov ($s1 eq "ebx" ? $s1 : "",$key);
++ &mov ($s2 eq "ecx" ? $s2 : "",244/4);
++ &sub ($s1,$tbl);
++ &mov ("esi",$key);
++ &and ($s1,0xfff);
++ &lea ("edi",$aes_key);
++ &cmp ($s1,2048+256);
++ &jb (&label("do_copy"));
++ &cmp ($s1,4096-244);
++ &jb (&label("skip_copy"));
++ &set_label("do_copy",4);
++ &mov ($_key,"edi");
++ &data_word(0xA5F3F689); # rep movsd
++ &set_label("skip_copy");
++
++ &mov ($key,16);
++ &set_label("prefetch_tbl",4);
++ &mov ($s0,&DWP(0,$tbl));
++ &mov ($s1,&DWP(32,$tbl));
++ &mov ($s2,&DWP(64,$tbl));
++ &mov ($acc,&DWP(96,$tbl));
++ &lea ($tbl,&DWP(128,$tbl));
++ &sub ($key,1);
++ &jnz (&label("prefetch_tbl"));
++ &sub ($tbl,2048);
++
++ &mov ($acc,$_inp);
++ &mov ($key,$_ivp);
++
++ &cmp ($s3,0);
++ &je (&label("fast_decrypt"));
++
++#----------------------------- ENCRYPT -----------------------------#
++ &mov ($s0,&DWP(0,$key)); # load iv
++ &mov ($s1,&DWP(4,$key));
++
++ &set_label("fast_enc_loop",16);
++ &mov ($s2,&DWP(8,$key));
++ &mov ($s3,&DWP(12,$key));
++
++ &xor ($s0,&DWP(0,$acc)); # xor input data
++ &xor ($s1,&DWP(4,$acc));
++ &xor ($s2,&DWP(8,$acc));
++ &xor ($s3,&DWP(12,$acc));
++
++ &mov ($key,$_key); # load key
++ &call ("_x86_AES_encrypt");
++
++ &mov ($acc,$_inp); # load inp
++ &mov ($key,$_out); # load out
++
++ &mov (&DWP(0,$key),$s0); # save output data
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($s2,$_len); # load len
++ &mov ($_inp,$acc); # save inp
++ &lea ($s3,&DWP(16,$key)); # advance out
++ &mov ($_out,$s3); # save out
++ &sub ($s2,16); # decrease len
++ &mov ($_len,$s2); # save len
++ &jnz (&label("fast_enc_loop"));
++ &mov ($acc,$_ivp); # load ivp
++ &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
++ &mov ($s3,&DWP(12,$key));
++ &mov (&DWP(0,$acc),$s0); # save ivec
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++
++ &cmp ($mark,0); # was the key schedule copied?
++ &mov ("edi",$_key);
++ &je (&label("skip_ezero"));
++ # zero copy of key schedule
++ &mov ("ecx",240/4);
++ &xor ("eax","eax");
++ &align (4);
++ &data_word(0xABF3F689); # rep stosd
++ &set_label("skip_ezero");
++ &mov ("esp",$_esp);
++ &popf ();
++ &set_label("drop_out");
++ &function_end_A();
++ &pushf (); # kludge, never executed
++
++#----------------------------- DECRYPT -----------------------------#
++&set_label("fast_decrypt",16);
++
++ &cmp ($acc,$_out);
++ &je (&label("fast_dec_in_place")); # in-place processing...
++
++ &mov ($_tmp,$key);
++
++ &align (4);
++ &set_label("fast_dec_loop",16);
++ &mov ($s0,&DWP(0,$acc)); # read input
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++
++ &mov ($key,$_key); # load key
++ &call ("_x86_AES_decrypt");
++
++ &mov ($key,$_tmp); # load ivp
++ &mov ($acc,$_len); # load len
++ &xor ($s0,&DWP(0,$key)); # xor iv
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &mov ($key,$_out); # load out
++ &mov ($acc,$_inp); # load inp
++
++ &mov (&DWP(0,$key),$s0); # write output
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ($s2,$_len); # load len
++ &mov ($_tmp,$acc); # save ivp
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($_inp,$acc); # save inp
++ &lea ($key,&DWP(16,$key)); # advance out
++ &mov ($_out,$key); # save out
++ &sub ($s2,16); # decrease len
++ &mov ($_len,$s2); # save len
++ &jnz (&label("fast_dec_loop"));
++ &mov ($key,$_tmp); # load temp ivp
++ &mov ($acc,$_ivp); # load user ivp
++ &mov ($s0,&DWP(0,$key)); # load iv
++ &mov ($s1,&DWP(4,$key));
++ &mov ($s2,&DWP(8,$key));
++ &mov ($s3,&DWP(12,$key));
++ &mov (&DWP(0,$acc),$s0); # copy back to user
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++ &jmp (&label("fast_dec_out"));
++
++ &set_label("fast_dec_in_place",16);
++ &set_label("fast_dec_in_place_loop");
++ &mov ($s0,&DWP(0,$acc)); # read input
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++
++ &lea ($key,$ivec);
++ &mov (&DWP(0,$key),$s0); # copy to temp
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ($key,$_key); # load key
++ &call ("_x86_AES_decrypt");
++
++ &mov ($key,$_ivp); # load ivp
++ &mov ($acc,$_out); # load out
++ &xor ($s0,&DWP(0,$key)); # xor iv
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &mov (&DWP(0,$acc),$s0); # write output
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++
++ &lea ($acc,&DWP(16,$acc)); # advance out
++ &mov ($_out,$acc); # save out
++
++ &lea ($acc,$ivec);
++ &mov ($s0,&DWP(0,$acc)); # read temp
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++
++ &mov (&DWP(0,$key),$s0); # copy iv
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ($acc,$_inp); # load inp
++ &mov ($s2,$_len); # load len
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($_inp,$acc); # save inp
++ &sub ($s2,16); # decrease len
++ &mov ($_len,$s2); # save len
++ &jnz (&label("fast_dec_in_place_loop"));
++
++ &set_label("fast_dec_out",4);
++ &cmp ($mark,0); # was the key schedule copied?
++ &mov ("edi",$_key);
++ &je (&label("skip_dzero"));
++ # zero copy of key schedule
++ &mov ("ecx",240/4);
++ &xor ("eax","eax");
++ &align (4);
++ &data_word(0xABF3F689); # rep stosd
++ &set_label("skip_dzero");
++ &mov ("esp",$_esp);
++ &popf ();
++ &function_end_A();
++ &pushf (); # kludge, never executed
++
++#--------------------------- SLOW ROUTINE ---------------------------#
++&set_label("slow_way",16);
++
++ &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
++ &mov ($key,&wparam(3)); # load key
++
++ # pre-allocate aligned stack frame...
++ &lea ($acc,&DWP(-80,"esp"));
++ &and ($acc,-64);
++
++ # ... and make sure it doesn't alias with $key modulo 1024
++ &lea ($s1,&DWP(-80-63,$key));
++ &sub ($s1,$acc);
++ &neg ($s1);
++ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
++ &sub ($acc,$s1);
++
++ # pick S-box copy which can't overlap with stack frame or $key
++ &lea ($s1,&DWP(768,$acc));
++ &sub ($s1,$tbl);
++ &and ($s1,0x300);
++ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
++
++ &lea ($s3,&wparam(0)); # pointer to parameter block
++
++ &exch ("esp",$acc);
++ &add ("esp",4); # reserve for return address!
++ &mov ($_tbl,$tbl); # save %ebp
++ &mov ($_esp,$acc); # save %esp
++ &mov ($_tmp,$s0); # save OPENSSL_ia32cap
++
++ &mov ($s0,&DWP(0,$s3)); # load inp
++ &mov ($s1,&DWP(4,$s3)); # load out
++ #&mov ($s2,&DWP(8,$s3)); # load len
++ #&mov ($key,&DWP(12,$s3)); # load key
++ &mov ($acc,&DWP(16,$s3)); # load ivp
++ &mov ($s3,&DWP(20,$s3)); # load enc flag
++
++ &mov ($_inp,$s0); # save copy of inp
++ &mov ($_out,$s1); # save copy of out
++ &mov ($_len,$s2); # save copy of len
++ &mov ($_key,$key); # save copy of key
++ &mov ($_ivp,$acc); # save copy of ivp
++
++ &mov ($key,$acc);
++ &mov ($acc,$s0);
++
++ &cmp ($s3,0);
++ &je (&label("slow_decrypt"));
++
++#--------------------------- SLOW ENCRYPT ---------------------------#
++ &cmp ($s2,16);
++ &mov ($s3,$s1);
++ &jb (&label("slow_enc_tail"));
++
++ if (!$x86only) {
++ &bt ($_tmp,25); # check for SSE bit
++ &jnc (&label("slow_enc_x86"));
++
++ &movq ("mm0",&QWP(0,$key)); # load iv
++ &movq ("mm4",&QWP(8,$key));
++
++ &set_label("slow_enc_loop_sse",16);
++ &pxor ("mm0",&QWP(0,$acc)); # xor input data
++ &pxor ("mm4",&QWP(8,$acc));
++
++ &mov ($key,$_key);
++ &call ("_sse_AES_encrypt_compact");
++
++ &mov ($acc,$_inp); # load inp
++ &mov ($key,$_out); # load out
++ &mov ($s2,$_len); # load len
++
++ &movq (&QWP(0,$key),"mm0"); # save output data
++ &movq (&QWP(8,$key),"mm4");
++
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($_inp,$acc); # save inp
++ &lea ($s3,&DWP(16,$key)); # advance out
++ &mov ($_out,$s3); # save out
++ &sub ($s2,16); # decrease len
++ &cmp ($s2,16);
++ &mov ($_len,$s2); # save len
++ &jae (&label("slow_enc_loop_sse"));
++ &test ($s2,15);
++ &jnz (&label("slow_enc_tail"));
++ &mov ($acc,$_ivp); # load ivp
++ &movq (&QWP(0,$acc),"mm0"); # save ivec
++ &movq (&QWP(8,$acc),"mm4");
++ &emms ();
++ &mov ("esp",$_esp);
++ &popf ();
++ &function_end_A();
++ &pushf (); # kludge, never executed
++ }
++ &set_label("slow_enc_x86",16);
++ &mov ($s0,&DWP(0,$key)); # load iv
++ &mov ($s1,&DWP(4,$key));
++
++ &set_label("slow_enc_loop_x86",4);
++ &mov ($s2,&DWP(8,$key));
++ &mov ($s3,&DWP(12,$key));
++
++ &xor ($s0,&DWP(0,$acc)); # xor input data
++ &xor ($s1,&DWP(4,$acc));
++ &xor ($s2,&DWP(8,$acc));
++ &xor ($s3,&DWP(12,$acc));
++
++ &mov ($key,$_key); # load key
++ &call ("_x86_AES_encrypt_compact");
++
++ &mov ($acc,$_inp); # load inp
++ &mov ($key,$_out); # load out
++
++ &mov (&DWP(0,$key),$s0); # save output data
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ($s2,$_len); # load len
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($_inp,$acc); # save inp
++ &lea ($s3,&DWP(16,$key)); # advance out
++ &mov ($_out,$s3); # save out
++ &sub ($s2,16); # decrease len
++ &cmp ($s2,16);
++ &mov ($_len,$s2); # save len
++ &jae (&label("slow_enc_loop_x86"));
++ &test ($s2,15);
++ &jnz (&label("slow_enc_tail"));
++ &mov ($acc,$_ivp); # load ivp
++ &mov ($s2,&DWP(8,$key)); # restore last dwords
++ &mov ($s3,&DWP(12,$key));
++ &mov (&DWP(0,$acc),$s0); # save ivec
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++
++ &mov ("esp",$_esp);
++ &popf ();
++ &function_end_A();
++ &pushf (); # kludge, never executed
++
++ &set_label("slow_enc_tail",16);
++ &emms () if (!$x86only);
++ &mov ($key eq "edi"? $key:"",$s3); # load out to edi
++ &mov ($s1,16);
++ &sub ($s1,$s2);
++ &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
++ &je (&label("enc_in_place"));
++ &align (4);
++ &data_word(0xA4F3F689); # rep movsb # copy input
++ &jmp (&label("enc_skip_in_place"));
++ &set_label("enc_in_place");
++ &lea ($key,&DWP(0,$key,$s2));
++ &set_label("enc_skip_in_place");
++ &mov ($s2,$s1);
++ &xor ($s0,$s0);
++ &align (4);
++ &data_word(0xAAF3F689); # rep stosb # zero tail
++
++ &mov ($key,$_ivp); # restore ivp
++ &mov ($acc,$s3); # output as input
++ &mov ($s0,&DWP(0,$key));
++ &mov ($s1,&DWP(4,$key));
++ &mov ($_len,16); # len=16
++ &jmp (&label("slow_enc_loop_x86")); # one more spin...
++
++#--------------------------- SLOW DECRYPT ---------------------------#
++&set_label("slow_decrypt",16);
++ if (!$x86only) {
++ &bt ($_tmp,25); # check for SSE bit
++ &jnc (&label("slow_dec_loop_x86"));
++
++ &set_label("slow_dec_loop_sse",4);
++ &movq ("mm0",&QWP(0,$acc)); # read input
++ &movq ("mm4",&QWP(8,$acc));
++
++ &mov ($key,$_key);
++ &call ("_sse_AES_decrypt_compact");
++
++ &mov ($acc,$_inp); # load inp
++ &lea ($s0,$ivec);
++ &mov ($s1,$_out); # load out
++ &mov ($s2,$_len); # load len
++ &mov ($key,$_ivp); # load ivp
++
++ &movq ("mm1",&QWP(0,$acc)); # re-read input
++ &movq ("mm5",&QWP(8,$acc));
++
++ &pxor ("mm0",&QWP(0,$key)); # xor iv
++ &pxor ("mm4",&QWP(8,$key));
++
++ &movq (&QWP(0,$key),"mm1"); # copy input to iv
++ &movq (&QWP(8,$key),"mm5");
++
++ &sub ($s2,16); # decrease len
++ &jc (&label("slow_dec_partial_sse"));
++
++ &movq (&QWP(0,$s1),"mm0"); # write output
++ &movq (&QWP(8,$s1),"mm4");
++
++ &lea ($s1,&DWP(16,$s1)); # advance out
++ &mov ($_out,$s1); # save out
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($_inp,$acc); # save inp
++ &mov ($_len,$s2); # save len
++ &jnz (&label("slow_dec_loop_sse"));
++ &emms ();
++ &mov ("esp",$_esp);
++ &popf ();
++ &function_end_A();
++ &pushf (); # kludge, never executed
++
++ &set_label("slow_dec_partial_sse",16);
++ &movq (&QWP(0,$s0),"mm0"); # save output to temp
++ &movq (&QWP(8,$s0),"mm4");
++ &emms ();
++
++ &add ($s2 eq "ecx" ? "ecx":"",16);
++ &mov ("edi",$s1); # out
++ &mov ("esi",$s0); # temp
++ &align (4);
++ &data_word(0xA4F3F689); # rep movsb # copy partial output
++
++ &mov ("esp",$_esp);
++ &popf ();
++ &function_end_A();
++ &pushf (); # kludge, never executed
++ }
++ &set_label("slow_dec_loop_x86",16);
++ &mov ($s0,&DWP(0,$acc)); # read input
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++
++ &lea ($key,$ivec);
++ &mov (&DWP(0,$key),$s0); # copy to temp
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ($key,$_key); # load key
++ &call ("_x86_AES_decrypt_compact");
++
++ &mov ($key,$_ivp); # load ivp
++ &mov ($acc,$_len); # load len
++ &xor ($s0,&DWP(0,$key)); # xor iv
++ &xor ($s1,&DWP(4,$key));
++ &xor ($s2,&DWP(8,$key));
++ &xor ($s3,&DWP(12,$key));
++
++ &sub ($acc,16);
++ &jc (&label("slow_dec_partial_x86"));
++
++ &mov ($_len,$acc); # save len
++ &mov ($acc,$_out); # load out
++
++ &mov (&DWP(0,$acc),$s0); # write output
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++
++ &lea ($acc,&DWP(16,$acc)); # advance out
++ &mov ($_out,$acc); # save out
++
++ &lea ($acc,$ivec);
++ &mov ($s0,&DWP(0,$acc)); # read temp
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++
++ &mov (&DWP(0,$key),$s0); # copy it to iv
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ($acc,$_inp); # load inp
++ &lea ($acc,&DWP(16,$acc)); # advance inp
++ &mov ($_inp,$acc); # save inp
++ &jnz (&label("slow_dec_loop_x86"));
++ &mov ("esp",$_esp);
++ &popf ();
++ &function_end_A();
++ &pushf (); # kludge, never executed
++
++ &set_label("slow_dec_partial_x86",16);
++ &lea ($acc,$ivec);
++ &mov (&DWP(0,$acc),$s0); # save output to temp
++ &mov (&DWP(4,$acc),$s1);
++ &mov (&DWP(8,$acc),$s2);
++ &mov (&DWP(12,$acc),$s3);
++
++ &mov ($acc,$_inp);
++ &mov ($s0,&DWP(0,$acc)); # re-read input
++ &mov ($s1,&DWP(4,$acc));
++ &mov ($s2,&DWP(8,$acc));
++ &mov ($s3,&DWP(12,$acc));
++
++ &mov (&DWP(0,$key),$s0); # copy it to iv
++ &mov (&DWP(4,$key),$s1);
++ &mov (&DWP(8,$key),$s2);
++ &mov (&DWP(12,$key),$s3);
++
++ &mov ("ecx",$_len);
++ &mov ("edi",$_out);
++ &lea ("esi",$ivec);
++ &align (4);
++ &data_word(0xA4F3F689); # rep movsb # copy partial output
++
++ &mov ("esp",$_esp);
++ &popf ();
++&function_end("AES_cbc_encrypt");
++}
++
++#------------------------------------------------------------------#
++
++sub enckey() # emit: eax ^= (bytes of edx substituted via the byte table, rotated by one byte) ^ rcon[ecx]; clobbers esi, ebx, edx
++{
++ &movz ("esi",&LB("edx")); # rk[i]>>0
++ &movz ("ebx",&BP(-128,$tbl,"esi",1)); # byte-table lookup; callers leave $tbl biased by +128
++ &movz ("esi",&HB("edx")); # rk[i]>>8
++ &shl ("ebx",24); # substituted byte 0 goes to bits 24..31
++ &xor ("eax","ebx");
++
++ &movz ("ebx",&BP(-128,$tbl,"esi",1)); # substituted byte 1 stays in bits 0..7
++ &shr ("edx",16); # expose the upper half of the word (destroys edx)
++ &movz ("esi",&LB("edx")); # rk[i]>>16
++ &xor ("eax","ebx");
++
++ &movz ("ebx",&BP(-128,$tbl,"esi",1));
++ &movz ("esi",&HB("edx")); # rk[i]>>24
++ &shl ("ebx",8); # substituted byte 2 goes to bits 8..15
++ &xor ("eax","ebx");
++
++ &movz ("ebx",&BP(-128,$tbl,"esi",1));
++ &shl ("ebx",16); # substituted byte 3 goes to bits 16..23
++ &xor ("eax","ebx");
++
++ &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon, dword-indexed by ecx
++}
++
++&function_begin("_x86_AES_set_encrypt_key"); # internal worker: expands the user key; eax = 0 on success, -1 on NULL pointer, -2 on bad bit count
++ &mov ("esi",&wparam(1)); # user supplied key (indices are prototype position+1 -- presumably the wrapper's extra return address; TODO confirm)
++ &mov ("edi",&wparam(3)); # private key schedule
++
++ &test ("esi",-1); # NULL check on the user key pointer
++ &jz (&label("badpointer"));
++ &test ("edi",-1); # NULL check on the key schedule pointer
++ &jz (&label("badpointer"));
++
++ &call (&label("pic_point")); # call/pop pair yields the address of pic_point for position-independent table access
++ &set_label("pic_point");
++ &blindpop($tbl);
++ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
++ &lea ($tbl,&DWP(2048+128,$tbl)); # $tbl = AES_Te+2048+128; the -128 displacements below compensate for the bias
++
++ # prefetch Te4
++ &mov ("eax",&DWP(0-128,$tbl));
++ &mov ("ebx",&DWP(32-128,$tbl));
++ &mov ("ecx",&DWP(64-128,$tbl));
++ &mov ("edx",&DWP(96-128,$tbl));
++ &mov ("eax",&DWP(128-128,$tbl));
++ &mov ("ebx",&DWP(160-128,$tbl));
++ &mov ("ecx",&DWP(192-128,$tbl));
++ &mov ("edx",&DWP(224-128,$tbl));
++
++ &mov ("ecx",&wparam(2)); # number of bits in key
++ &cmp ("ecx",128);
++ &je (&label("10rounds"));
++ &cmp ("ecx",192);
++ &je (&label("12rounds"));
++ &cmp ("ecx",256);
++ &je (&label("14rounds"));
++ &mov ("eax",-2); # invalid number of bits
++ &jmp (&label("exit"));
++
++ &set_label("10rounds");
++ &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
++ &mov ("ebx",&DWP(4,"esi"));
++ &mov ("ecx",&DWP(8,"esi"));
++ &mov ("edx",&DWP(12,"esi"));
++ &mov (&DWP(0,"edi"),"eax");
++ &mov (&DWP(4,"edi"),"ebx");
++ &mov (&DWP(8,"edi"),"ecx");
++ &mov (&DWP(12,"edi"),"edx");
++
++ &xor ("ecx","ecx"); # pass counter, also the rcon index used by enckey
++ &jmp (&label("10shortcut")); # eax/edx already hold rk[0]/rk[3] from the copy above
++
++ &align (4);
++ &set_label("10loop");
++ &mov ("eax",&DWP(0,"edi")); # rk[0]
++ &mov ("edx",&DWP(12,"edi")); # rk[3]
++ &set_label("10shortcut");
++ &enckey ();
++
++ &mov (&DWP(16,"edi"),"eax"); # rk[4]
++ &xor ("eax",&DWP(4,"edi"));
++ &mov (&DWP(20,"edi"),"eax"); # rk[5]
++ &xor ("eax",&DWP(8,"edi"));
++ &mov (&DWP(24,"edi"),"eax"); # rk[6]
++ &xor ("eax",&DWP(12,"edi"));
++ &mov (&DWP(28,"edi"),"eax"); # rk[7]
++ &inc ("ecx");
++ &add ("edi",16); # slide the rk[] window forward by one 4-word group
++ &cmp ("ecx",10);
++ &jl (&label("10loop"));
++
++ &mov (&DWP(80,"edi"),10); # setup number of rounds (edi advanced 10*16, so this is offset 240 from the start)
++ &xor ("eax","eax"); # return 0
++ &jmp (&label("exit"));
++
++ &set_label("12rounds");
++ &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
++ &mov ("ebx",&DWP(4,"esi"));
++ &mov ("ecx",&DWP(8,"esi"));
++ &mov ("edx",&DWP(12,"esi"));
++ &mov (&DWP(0,"edi"),"eax");
++ &mov (&DWP(4,"edi"),"ebx");
++ &mov (&DWP(8,"edi"),"ecx");
++ &mov (&DWP(12,"edi"),"edx");
++ &mov ("ecx",&DWP(16,"esi"));
++ &mov ("edx",&DWP(20,"esi"));
++ &mov (&DWP(16,"edi"),"ecx");
++ &mov (&DWP(20,"edi"),"edx");
++
++ &xor ("ecx","ecx"); # pass counter, also the rcon index used by enckey
++ &jmp (&label("12shortcut")); # eax/edx already hold rk[0]/rk[5] from the copy above
++
++ &align (4);
++ &set_label("12loop");
++ &mov ("eax",&DWP(0,"edi")); # rk[0]
++ &mov ("edx",&DWP(20,"edi")); # rk[5]
++ &set_label("12shortcut");
++ &enckey ();
++
++ &mov (&DWP(24,"edi"),"eax"); # rk[6]
++ &xor ("eax",&DWP(4,"edi"));
++ &mov (&DWP(28,"edi"),"eax"); # rk[7]
++ &xor ("eax",&DWP(8,"edi"));
++ &mov (&DWP(32,"edi"),"eax"); # rk[8]
++ &xor ("eax",&DWP(12,"edi"));
++ &mov (&DWP(36,"edi"),"eax"); # rk[9]
++
++ &cmp ("ecx",7); # eighth pass: stop after four words instead of six
++ &je (&label("12break"));
++ &inc ("ecx");
++
++ &xor ("eax",&DWP(16,"edi"));
++ &mov (&DWP(40,"edi"),"eax"); # rk[10]
++ &xor ("eax",&DWP(20,"edi"));
++ &mov (&DWP(44,"edi"),"eax"); # rk[11]
++
++ &add ("edi",24); # slide the rk[] window forward by one 6-word group
++ &jmp (&label("12loop"));
++
++ &set_label("12break");
++ &mov (&DWP(72,"edi"),12); # setup number of rounds (edi advanced 7*24, so this is offset 240 from the start)
++ &xor ("eax","eax"); # return 0
++ &jmp (&label("exit"));
++
++ &set_label("14rounds");
++ &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
++ &mov ("ebx",&DWP(4,"esi"));
++ &mov ("ecx",&DWP(8,"esi"));
++ &mov ("edx",&DWP(12,"esi"));
++ &mov (&DWP(0,"edi"),"eax");
++ &mov (&DWP(4,"edi"),"ebx");
++ &mov (&DWP(8,"edi"),"ecx");
++ &mov (&DWP(12,"edi"),"edx");
++ &mov ("eax",&DWP(16,"esi"));
++ &mov ("ebx",&DWP(20,"esi"));
++ &mov ("ecx",&DWP(24,"esi"));
++ &mov ("edx",&DWP(28,"esi"));
++ &mov (&DWP(16,"edi"),"eax");
++ &mov (&DWP(20,"edi"),"ebx");
++ &mov (&DWP(24,"edi"),"ecx");
++ &mov (&DWP(28,"edi"),"edx");
++
++ &xor ("ecx","ecx"); # pass counter, also the rcon index used by enckey
++ &jmp (&label("14shortcut")); # edx already holds rk[7]; eax does not hold rk[0], so the shortcut reloads it
++
++ &align (4);
++ &set_label("14loop");
++ &mov ("edx",&DWP(28,"edi")); # rk[7]
++ &set_label("14shortcut");
++ &mov ("eax",&DWP(0,"edi")); # rk[0]
++
++ &enckey ();
++
++ &mov (&DWP(32,"edi"),"eax"); # rk[8]
++ &xor ("eax",&DWP(4,"edi"));
++ &mov (&DWP(36,"edi"),"eax"); # rk[9]
++ &xor ("eax",&DWP(8,"edi"));
++ &mov (&DWP(40,"edi"),"eax"); # rk[10]
++ &xor ("eax",&DWP(12,"edi"));
++ &mov (&DWP(44,"edi"),"eax"); # rk[11]
++
++ &cmp ("ecx",6); # seventh pass: stop after four words instead of eight
++ &je (&label("14break"));
++ &inc ("ecx");
++
++ &mov ("edx","eax"); # second half: substitute the bytes of rk[11] in place (no rotation, no rcon) and fold into rk[4]
++ &mov ("eax",&DWP(16,"edi")); # rk[4]
++ &movz ("esi",&LB("edx")); # rk[11]>>0
++ &movz ("ebx",&BP(-128,$tbl,"esi",1));
++ &movz ("esi",&HB("edx")); # rk[11]>>8
++ &xor ("eax","ebx");
++
++ &movz ("ebx",&BP(-128,$tbl,"esi",1));
++ &shr ("edx",16);
++ &shl ("ebx",8);
++ &movz ("esi",&LB("edx")); # rk[11]>>16
++ &xor ("eax","ebx");
++
++ &movz ("ebx",&BP(-128,$tbl,"esi",1));
++ &movz ("esi",&HB("edx")); # rk[11]>>24
++ &shl ("ebx",16);
++ &xor ("eax","ebx");
++
++ &movz ("ebx",&BP(-128,$tbl,"esi",1));
++ &shl ("ebx",24);
++ &xor ("eax","ebx");
++
++ &mov (&DWP(48,"edi"),"eax"); # rk[12]
++ &xor ("eax",&DWP(20,"edi"));
++ &mov (&DWP(52,"edi"),"eax"); # rk[13]
++ &xor ("eax",&DWP(24,"edi"));
++ &mov (&DWP(56,"edi"),"eax"); # rk[14]
++ &xor ("eax",&DWP(28,"edi"));
++ &mov (&DWP(60,"edi"),"eax"); # rk[15]
++
++ &add ("edi",32); # slide the rk[] window forward by one 8-word group
++ &jmp (&label("14loop"));
++
++ &set_label("14break");
++ &mov (&DWP(48,"edi"),14); # setup number of rounds (edi advanced 6*32, so this is offset 240 from the start)
++ &xor ("eax","eax"); # return 0
++ &jmp (&label("exit"));
++
++ &set_label("badpointer");
++ &mov ("eax",-1); # NULL userKey or key
++ &set_label("exit");
++&function_end("_x86_AES_set_encrypt_key");
++
++# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
++# AES_KEY *key)
++&function_begin_B("AES_set_encrypt_key"); # public entry point: a bare trampoline around the internal worker
++ &call ("_x86_AES_set_encrypt_key"); # no arguments are re-pushed; the worker reads the caller's arguments directly
++ &ret (); # eax still holds the worker's status code
++&function_end_B("AES_set_encrypt_key");
++
++sub deckey() # emit code that transforms the key-schedule word at 4*$i($key) in place; $tp1 holds the word on entry, $tp2/$tp4/$tp8 are scratch registers
++{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
++ my $tmp = $tbl; # the register named by $tbl doubles as scratch here
++
++ &mov ($tmp,0x80808080); # isolate the top bit of every byte of tp1
++ &and ($tmp,$tp1);
++ &lea ($tp2,&DWP(0,$tp1,$tp1)); # tp2 = tp1+tp1
++ &mov ($acc,$tmp);
++ &shr ($tmp,7);
++ &sub ($acc,$tmp); # 0x80 -> 0x7f in each byte whose top bit was set
++ &and ($tp2,0xfefefefe); # discard bits shifted in from the neighbouring byte
++ &and ($acc,0x1b1b1b1b); # 0x1b for each byte that overflowed
++ &xor ($tp2,$acc); # tp2 = per-byte doubling of tp1 with 0x1b reduction
++ &mov ($tmp,0x80808080);
++
++ &and ($tmp,$tp2); # same doubling step again: tp4 from tp2
++ &lea ($tp4,&DWP(0,$tp2,$tp2));
++ &mov ($acc,$tmp);
++ &shr ($tmp,7);
++ &sub ($acc,$tmp);
++ &and ($tp4,0xfefefefe);
++ &and ($acc,0x1b1b1b1b);
++ &xor ($tp2,$tp1); # tp2^tp1
++ &xor ($tp4,$acc);
++ &mov ($tmp,0x80808080);
++
++ &and ($tmp,$tp4); # and once more: tp8 from tp4
++ &lea ($tp8,&DWP(0,$tp4,$tp4));
++ &mov ($acc,$tmp);
++ &shr ($tmp,7);
++ &xor ($tp4,$tp1); # tp4^tp1
++ &sub ($acc,$tmp);
++ &and ($tp8,0xfefefefe);
++ &and ($acc,0x1b1b1b1b);
++ &rotl ($tp1,8); # = ROTATE(tp1,8)
++ &xor ($tp8,$acc);
++
++ &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
++
++ &xor ($tp1,$tp2);
++ &xor ($tp2,$tp8);
++ &xor ($tp1,$tp4);
++ &rotl ($tp2,24);
++ &xor ($tp4,$tp8);
++ &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
++ &rotl ($tp4,16);
++ &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
++ &rotl ($tp8,8);
++ &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
++ &mov ($tp2,$tmp); # hand the preloaded next word over in $tp2 (the caller passes it as $tp1 of the next deckey)
++ &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
++
++ &mov (&DWP(4*$i,$key),$tp1); # write the transformed word back
++}
++
++# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
++# AES_KEY *key)
++&function_begin_B("AES_set_decrypt_key"); # builds the encryption schedule, reverses its 16-byte chunks, then runs deckey over the inner round keys
++ &call ("_x86_AES_set_encrypt_key"); # build the encryption schedule first
++ &cmp ("eax",0);
++ &je (&label("proceed"));
++ &ret (); # propagate the worker's non-zero status unchanged
++
++ &set_label("proceed");
++ &push ("ebp"); # registers are saved by hand; function_end below emits the matching epilogue
++ &push ("ebx");
++ &push ("esi");
++ &push ("edi");
++
++ &mov ("esi",&wparam(2)); # key schedule
++ &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
++ &lea ("ecx",&DWP(0,"","ecx",4)); # ecx = rounds*4
++ &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
++
++ &set_label("invert",4); # invert order of chunks
++ &mov ("eax",&DWP(0,"esi")); # swap the 16-byte chunk at esi with the one at edi, 8 bytes at a time
++ &mov ("ebx",&DWP(4,"esi"));
++ &mov ("ecx",&DWP(0,"edi"));
++ &mov ("edx",&DWP(4,"edi"));
++ &mov (&DWP(0,"edi"),"eax");
++ &mov (&DWP(4,"edi"),"ebx");
++ &mov (&DWP(0,"esi"),"ecx");
++ &mov (&DWP(4,"esi"),"edx");
++ &mov ("eax",&DWP(8,"esi"));
++ &mov ("ebx",&DWP(12,"esi"));
++ &mov ("ecx",&DWP(8,"edi"));
++ &mov ("edx",&DWP(12,"edi"));
++ &mov (&DWP(8,"edi"),"eax");
++ &mov (&DWP(12,"edi"),"ebx");
++ &mov (&DWP(8,"esi"),"ecx");
++ &mov (&DWP(12,"esi"),"edx");
++ &add ("esi",16);
++ &sub ("edi",16);
++ &cmp ("esi","edi"); # the two cursors meet on the middle chunk
++ &jne (&label("invert"));
++
++ &mov ($key,&wparam(2));
++ &mov ($acc,&DWP(240,$key)); # pull number of rounds
++ &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
++ &lea ($acc,&DWP(0,$key,$acc,8)); # key + 16*(rounds-1): address of the last chunk to transform
++ &mov (&wparam(2),$acc); # the argument slot is reused to hold the loop bound
++
++ &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
++ &set_label("permute",4); # permute the key schedule
++ &add ($key,16); # first pass starts at chunk 1; chunk 0 is left untouched
++ &deckey (0,$key,$s0,$s1,$s2,$s3); # register roles rotate so each call finds its input where the previous one left it
++ &deckey (1,$key,$s1,$s2,$s3,$s0);
++ &deckey (2,$key,$s2,$s3,$s0,$s1);
++ &deckey (3,$key,$s3,$s0,$s1,$s2);
++ &cmp ($key,&wparam(2));
++ &jb (&label("permute"));
++
++ &xor ("eax","eax"); # return success
++&function_end("AES_set_decrypt_key");
++&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
++
++&asm_finish();
++
++close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface at close; fail rather than leave truncated output
+diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
+new file mode 100755
+index 0000000000..d87e201147
+--- /dev/null
++++ b/crypto/aes/asm/aes-x86_64.pl
+@@ -0,0 +1,2916 @@
++#! /usr/bin/env perl
++# Copyright 2005-2019 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the OpenSSL license (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# Version 2.1.
++#
++# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
++# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
++# [you'll notice a lot of resemblance], such as compressed S-boxes
++# in little-endian byte order, prefetch of these tables in CBC mode,
++# as well as avoiding L1 cache aliasing between stack frame and key
++# schedule and already mentioned tables, compressed Td4...
++#
++# Performance in number of cycles per processed byte for 128-bit key:
++#
++# ECB encrypt ECB decrypt CBC large chunk
++# AMD64 33 43 13.0
++# EM64T 38 56 18.6(*)
++# Core 2 30 42 14.5(*)
++# Atom 65 86 32.1(*)
++#
++# (*) with hyper-threading off
++
++$flavour = shift; # first argument: assembler flavour
++$output = shift; # second argument: output file name
++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing a dot is taken as the output file
++
++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++die "can't locate x86_64-xlate.pl";
++
++open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # pipe everything printed through the translator; abort if it cannot be started
++*STDOUT=*OUT;
++
++$verticalspin=1; # unlike 32-bit version $verticalspin performs
++ # ~15% better on both AMD and Intel cores
++$speed_limit=512; # see aes-586.pl for details
++
++$code=".text\n"; # the generated assembly is accumulated in $code
++
++$s0="%eax"; # $s0..$s3: the four 32-bit words of the AES state
++$s1="%ebx";
++$s2="%ecx";
++$s3="%edx";
++$acc0="%esi"; $mask80="%rsi"; # $acc0..$acc2: scratch/index registers; $mask* name the same registers at 64-bit width
++$acc1="%edi"; $maskfe="%rdi";
++$acc2="%ebp"; $mask1b="%rbp";
++$inp="%r8";
++$out="%r9";
++$t0="%r10d"; # $t0..$t2: temporaries that collect a round's output words
++$t1="%r11d";
++$t2="%r12d";
++$rnds="%r13d"; # round counter
++$sbox="%r14"; # base address of the lookup table in use
++$key="%r15"; # current position in the key schedule
++
++sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; } # %eax/%rax -> %ah etc.; any other name is returned unchanged
++sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/; # %eax/%rax -> %al
++ $r =~ s/%[er]([sd]i)/%${1}l/; # %esi/%rdi -> %sil/%dil
++ $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; } # %r10 or %r10d -> %r10b
++sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e${1}/; # %rax -> %eax
++ $r =~ s/%r([0-9]+)/%r${1}d/; $r; } # %r14 -> %r14d
++sub _data_word() # append one ".long w,w" line (value duplicated) to $code per argument; stops at the first undef
++{ my $w;
++ for ($w=shift; defined($w); $w=shift) { $code.=sprintf(".long\t0x%08x,0x%08x\n",$w,$w); }
++}
++sub data_word() # append a single ".long" line to $code listing the arguments as comma-separated 32-bit hex values
++{ my @lead=@_;
++ my $final=pop(@lead); # the last value is printed separately so it ends in "\n" rather than ","
++ $code.=".long\t";
++ foreach my $w (@lead) { last if (!defined($w)); $code.=sprintf("0x%08x,",$w); }
++ $code.=sprintf("0x%08x\n",$final);
++}
++
++sub data_byte() # append a single ".byte" line to $code; each argument is masked to its low 8 bits
++{ my @lead=@_;
++ my $final=pop(@lead); # the last value is printed separately so it ends in "\n" rather than ","
++ $code.=".byte\t";
++ foreach my $b (@lead) { last if (!defined($b)); $code.=sprintf("0x%02x,",$b&0xff); }
++ $code.=sprintf("0x%02x\n",$final&0xff);
++}
++
++sub encvert() # append one full table-driven encryption round: the four output words are built side by side in $t0..$t3
++{ my $t3="%r8d"; # zaps $inp!
++
++$code.=<<___; # table entries are 8 bytes apart; displacements 0/3/2/1 pick differently aligned dwords; ends by advancing $key by 16 and XORing in the next round key
++ # favor 3-way issue Opteron pipeline...
++ movzb `&lo("$s0")`,$acc0
++ movzb `&lo("$s1")`,$acc1
++ movzb `&lo("$s2")`,$acc2
++ mov 0($sbox,$acc0,8),$t0
++ mov 0($sbox,$acc1,8),$t1
++ mov 0($sbox,$acc2,8),$t2
++
++ movzb `&hi("$s1")`,$acc0
++ movzb `&hi("$s2")`,$acc1
++ movzb `&lo("$s3")`,$acc2
++ xor 3($sbox,$acc0,8),$t0
++ xor 3($sbox,$acc1,8),$t1
++ mov 0($sbox,$acc2,8),$t3
++
++ movzb `&hi("$s3")`,$acc0
++ shr \$16,$s2
++ movzb `&hi("$s0")`,$acc2
++ xor 3($sbox,$acc0,8),$t2
++ shr \$16,$s3
++ xor 3($sbox,$acc2,8),$t3
++
++ shr \$16,$s1
++ lea 16($key),$key
++ shr \$16,$s0
++
++ movzb `&lo("$s2")`,$acc0
++ movzb `&lo("$s3")`,$acc1
++ movzb `&lo("$s0")`,$acc2
++ xor 2($sbox,$acc0,8),$t0
++ xor 2($sbox,$acc1,8),$t1
++ xor 2($sbox,$acc2,8),$t2
++
++ movzb `&hi("$s3")`,$acc0
++ movzb `&hi("$s0")`,$acc1
++ movzb `&lo("$s1")`,$acc2
++ xor 1($sbox,$acc0,8),$t0
++ xor 1($sbox,$acc1,8),$t1
++ xor 2($sbox,$acc2,8),$t3
++
++ mov 12($key),$s3
++ movzb `&hi("$s1")`,$acc1
++ movzb `&hi("$s2")`,$acc2
++ mov 0($key),$s0
++ xor 1($sbox,$acc1,8),$t2
++ xor 1($sbox,$acc2,8),$t3
++
++ mov 4($key),$s1
++ mov 8($key),$s2
++ xor $t0,$s0
++ xor $t1,$s1
++ xor $t2,$s2
++ xor $t3,$s3
++___
++}
++
++sub enclastvert() # append the final encryption round: one byte lane is kept from each table lookup, then the last round key is applied
++{ my $t3="%r8d"; # zaps $inp!
++
++$code.=<<___; # lookups are narrowed with movzb or AND masks to a single byte lane each; the key is read at 16+n($key), $key itself is not advanced
++ movzb `&lo("$s0")`,$acc0
++ movzb `&lo("$s1")`,$acc1
++ movzb `&lo("$s2")`,$acc2
++ movzb 2($sbox,$acc0,8),$t0
++ movzb 2($sbox,$acc1,8),$t1
++ movzb 2($sbox,$acc2,8),$t2
++
++ movzb `&lo("$s3")`,$acc0
++ movzb `&hi("$s1")`,$acc1
++ movzb `&hi("$s2")`,$acc2
++ movzb 2($sbox,$acc0,8),$t3
++ mov 0($sbox,$acc1,8),$acc1 #$t0
++ mov 0($sbox,$acc2,8),$acc2 #$t1
++
++ and \$0x0000ff00,$acc1
++ and \$0x0000ff00,$acc2
++
++ xor $acc1,$t0
++ xor $acc2,$t1
++ shr \$16,$s2
++
++ movzb `&hi("$s3")`,$acc0
++ movzb `&hi("$s0")`,$acc1
++ shr \$16,$s3
++ mov 0($sbox,$acc0,8),$acc0 #$t2
++ mov 0($sbox,$acc1,8),$acc1 #$t3
++
++ and \$0x0000ff00,$acc0
++ and \$0x0000ff00,$acc1
++ shr \$16,$s1
++ xor $acc0,$t2
++ xor $acc1,$t3
++ shr \$16,$s0
++
++ movzb `&lo("$s2")`,$acc0
++ movzb `&lo("$s3")`,$acc1
++ movzb `&lo("$s0")`,$acc2
++ mov 0($sbox,$acc0,8),$acc0 #$t0
++ mov 0($sbox,$acc1,8),$acc1 #$t1
++ mov 0($sbox,$acc2,8),$acc2 #$t2
++
++ and \$0x00ff0000,$acc0
++ and \$0x00ff0000,$acc1
++ and \$0x00ff0000,$acc2
++
++ xor $acc0,$t0
++ xor $acc1,$t1
++ xor $acc2,$t2
++
++ movzb `&lo("$s1")`,$acc0
++ movzb `&hi("$s3")`,$acc1
++ movzb `&hi("$s0")`,$acc2
++ mov 0($sbox,$acc0,8),$acc0 #$t3
++ mov 2($sbox,$acc1,8),$acc1 #$t0
++ mov 2($sbox,$acc2,8),$acc2 #$t1
++
++ and \$0x00ff0000,$acc0
++ and \$0xff000000,$acc1
++ and \$0xff000000,$acc2
++
++ xor $acc0,$t3
++ xor $acc1,$t0
++ xor $acc2,$t1
++
++ movzb `&hi("$s1")`,$acc0
++ movzb `&hi("$s2")`,$acc1
++ mov 16+12($key),$s3
++ mov 2($sbox,$acc0,8),$acc0 #$t2
++ mov 2($sbox,$acc1,8),$acc1 #$t3
++ mov 16+0($key),$s0
++
++ and \$0xff000000,$acc0
++ and \$0xff000000,$acc1
++
++ xor $acc0,$t2
++ xor $acc1,$t3
++
++ mov 16+4($key),$s1
++ mov 16+8($key),$s2
++ xor $t0,$s0
++ xor $t1,$s1
++ xor $t2,$s2
++ xor $t3,$s3
++___
++}
++
++sub encstep() # append code computing output word $i (0..3) of one table-driven round; @s lists the state registers, $s[0] supplying the low byte
++{ my ($i,@s) = @_;
++ my $tmp0=$acc0;
++ my $tmp1=$acc1;
++ my $tmp2=$acc2;
++ my $out=($t0,$t1,$t2,$s[0])[$i]; # words 0..2 are buffered in temporaries, word 3 is built in place in $s[0]
++
++ if ($i==3) { # last word: the other state registers are not needed afterwards, so they serve as scratch directly
++ $tmp0=$s[1];
++ $tmp1=$s[2];
++ $tmp2=$s[3];
++ }
++ $code.=" movzb ".&lo($s[0]).",$out\n";
++ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
++ $code.=" lea 16($key),$key\n" if ($i==0); # advance to the next round key once per round
++
++ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
++ $code.=" mov 0($sbox,$out,8),$out\n";
++
++ $code.=" shr \$16,$tmp1\n";
++ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
++ $code.=" xor 3($sbox,$tmp0,8),$out\n";
++
++ $code.=" movzb ".&lo($tmp1).",$tmp1\n";
++ $code.=" shr \$24,$tmp2\n";
++ $code.=" xor 4*$i($key),$out\n"; # fold in word $i of the round key
++
++ $code.=" xor 2($sbox,$tmp1,8),$out\n";
++ $code.=" xor 1($sbox,$tmp2,8),$out\n";
++
++ $code.=" mov $t0,$s[1]\n" if ($i==3); # after the last word, copy the three buffered words back into state registers
++ $code.=" mov $t1,$s[2]\n" if ($i==3);
++ $code.=" mov $t2,$s[3]\n" if ($i==3);
++ $code.="\n";
++}
++
++sub enclast() # append code computing output word $i (0..3) of the final round; unlike encstep it emits no round-key XOR
++{ my ($i,@s)=@_;
++ my $tmp0=$acc0;
++ my $tmp1=$acc1;
++ my $tmp2=$acc2;
++ my $out=($t0,$t1,$t2,$s[0])[$i]; # words 0..2 are buffered in temporaries, word 3 is built in place in $s[0]
++
++ if ($i==3) { # last word: the other state registers are not needed afterwards, so they serve as scratch directly
++ $tmp0=$s[1];
++ $tmp1=$s[2];
++ $tmp2=$s[3];
++ }
++ $code.=" movzb ".&lo($s[0]).",$out\n";
++ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
++
++ $code.=" mov 2($sbox,$out,8),$out\n";
++ $code.=" shr \$16,$tmp1\n";
++ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
++
++ $code.=" and \$0x000000ff,$out\n"; # keep only byte lane 0 of this lookup
++ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
++ $code.=" movzb ".&lo($tmp1).",$tmp1\n";
++ $code.=" shr \$24,$tmp2\n";
++
++ $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
++ $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
++ $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
++
++ $code.=" and \$0x0000ff00,$tmp0\n"; # one byte lane is kept from each of the remaining lookups
++ $code.=" and \$0x00ff0000,$tmp1\n";
++ $code.=" and \$0xff000000,$tmp2\n";
++
++ $code.=" xor $tmp0,$out\n";
++ $code.=" mov $t0,$s[1]\n" if ($i==3); # copy-back of the buffered words is interleaved with the final XORs
++ $code.=" xor $tmp1,$out\n";
++ $code.=" mov $t1,$s[2]\n" if ($i==3);
++ $code.=" xor $tmp2,$out\n";
++ $code.=" mov $t2,$s[3]\n" if ($i==3);
++ $code.="\n";
++}
++
++$code.=<<___; # emit _x86_64_AES_encrypt: initial key XOR, then $rnds = rounds-1 passes of the loop body
++.type _x86_64_AES_encrypt,\@abi-omnipotent
++.align 16
++_x86_64_AES_encrypt:
++ xor 0($key),$s0 # xor with key
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++
++ mov 240($key),$rnds # load key->rounds
++ sub \$1,$rnds
++ jmp .Lenc_loop
++.align 16
++.Lenc_loop:
++___
++ if ($verticalspin) { &encvert(); } # loop body: either the interleaved round or four per-word steps
++ else { &encstep(0,$s0,$s1,$s2,$s3);
++ &encstep(1,$s1,$s2,$s3,$s0);
++ &encstep(2,$s2,$s3,$s0,$s1);
++ &encstep(3,$s3,$s0,$s1,$s2);
++ }
++$code.=<<___; # loop control
++ sub \$1,$rnds
++ jnz .Lenc_loop
++___
++ if ($verticalspin) { &enclastvert(); } # final round; enclastvert applies the last round key itself
++ else { &enclast(0,$s0,$s1,$s2,$s3);
++ &enclast(1,$s1,$s2,$s3,$s0);
++ &enclast(2,$s2,$s3,$s0,$s1);
++ &enclast(3,$s3,$s0,$s1,$s2);
++ $code.=<<___; # enclast emits no key XOR, so the last round key is applied here
++ xor 16+0($key),$s0 # xor with key
++ xor 16+4($key),$s1
++ xor 16+8($key),$s2
++ xor 16+12($key),$s3
++___
++ }
++$code.=<<___; # return
++ .byte 0xf3,0xc3 # rep ret
++.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
++___
++
++# it's possible to implement this by shifting tN by 8, filling least
++# significant byte with byte load and finally bswap-ing at the end,
++# but such partial register load kills Core 2...
++sub enccompactvert() # append the byte-substitution part of one "compact" round: 16 single-byte lookups at ($sbox,idx,1) assembled with shifts
++{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); # extra temporaries for this routine
++
++$code.=<<___; # results are left in $s0..$s3; no round key is applied here
++ movzb `&lo("$s0")`,$t0
++ movzb `&lo("$s1")`,$t1
++ movzb `&lo("$s2")`,$t2
++ movzb `&lo("$s3")`,$t3
++ movzb `&hi("$s1")`,$acc0
++ movzb `&hi("$s2")`,$acc1
++ shr \$16,$s2
++ movzb `&hi("$s3")`,$acc2
++ movzb ($sbox,$t0,1),$t0
++ movzb ($sbox,$t1,1),$t1
++ movzb ($sbox,$t2,1),$t2
++ movzb ($sbox,$t3,1),$t3
++
++ movzb ($sbox,$acc0,1),$t4 #$t0
++ movzb `&hi("$s0")`,$acc0
++ movzb ($sbox,$acc1,1),$t5 #$t1
++ movzb `&lo("$s2")`,$acc1
++ movzb ($sbox,$acc2,1),$acc2 #$t2
++ movzb ($sbox,$acc0,1),$acc0 #$t3
++
++ shl \$8,$t4
++ shr \$16,$s3
++ shl \$8,$t5
++ xor $t4,$t0
++ shr \$16,$s0
++ movzb `&lo("$s3")`,$t4
++ shr \$16,$s1
++ xor $t5,$t1
++ shl \$8,$acc2
++ movzb `&lo("$s0")`,$t5
++ movzb ($sbox,$acc1,1),$acc1 #$t0
++ xor $acc2,$t2
++
++ shl \$8,$acc0
++ movzb `&lo("$s1")`,$acc2
++ shl \$16,$acc1
++ xor $acc0,$t3
++ movzb ($sbox,$t4,1),$t4 #$t1
++ movzb `&hi("$s3")`,$acc0
++ movzb ($sbox,$t5,1),$t5 #$t2
++ xor $acc1,$t0
++
++ shr \$8,$s2
++ movzb `&hi("$s0")`,$acc1
++ shl \$16,$t4
++ shr \$8,$s1
++ shl \$16,$t5
++ xor $t4,$t1
++ movzb ($sbox,$acc2,1),$acc2 #$t3
++ movzb ($sbox,$acc0,1),$acc0 #$t0
++ movzb ($sbox,$acc1,1),$acc1 #$t1
++ movzb ($sbox,$s2,1),$s3 #$t3
++ movzb ($sbox,$s1,1),$s2 #$t2
++
++ shl \$16,$acc2
++ xor $t5,$t2
++ shl \$24,$acc0
++ xor $acc2,$t3
++ shl \$24,$acc1
++ xor $acc0,$t0
++ shl \$24,$s3
++ xor $acc1,$t1
++ shl \$24,$s2
++ mov $t0,$s0
++ mov $t1,$s1
++ xor $t2,$s2
++ xor $t3,$s3
++___
++}
++
++sub enctransform_ref() # append the one-word form of the column mixing step, operating in place on register $sn
++{ my $sn = shift;
++ my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); # scratch registers local to this routine
++
++$code.=<<___; # $r2 = per-byte doubling of $sn (0x1b folded in where a byte's top bit was set); result is combined from $r2 and rotations of $sn
++ mov $sn,$acc
++ and \$0x80808080,$acc
++ mov $acc,$tmp
++ shr \$7,$tmp
++ lea ($sn,$sn),$r2
++ sub $tmp,$acc
++ and \$0xfefefefe,$r2
++ and \$0x1b1b1b1b,$acc
++ mov $sn,$tmp
++ xor $acc,$r2
++
++ xor $r2,$sn
++ rol \$24,$sn
++ xor $r2,$sn
++ ror \$16,$tmp
++ xor $tmp,$sn
++ ror \$8,$tmp
++ xor $tmp,$sn
++___
++}
++
++# unlike decrypt case it does not pay off to parallelize enctransform
++sub enctransform() # append the column mixing step for all four state words, handled two at a time ($s0/$s1, then $s2/$s3)
++{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); # $t3 shares a register with $acc2; $r20/$r21 hold the doubled words
++
++$code.=<<___; # same arithmetic as enctransform_ref, interleaved; the tail also issues four table loads tagged "prefetch Te4"
++ mov \$0x80808080,$t0
++ mov \$0x80808080,$t1
++ and $s0,$t0
++ and $s1,$t1
++ mov $t0,$acc0
++ mov $t1,$acc1
++ shr \$7,$t0
++ lea ($s0,$s0),$r20
++ shr \$7,$t1
++ lea ($s1,$s1),$r21
++ sub $t0,$acc0
++ sub $t1,$acc1
++ and \$0xfefefefe,$r20
++ and \$0xfefefefe,$r21
++ and \$0x1b1b1b1b,$acc0
++ and \$0x1b1b1b1b,$acc1
++ mov $s0,$t0
++ mov $s1,$t1
++ xor $acc0,$r20
++ xor $acc1,$r21
++
++ xor $r20,$s0
++ xor $r21,$s1
++ mov \$0x80808080,$t2
++ rol \$24,$s0
++ mov \$0x80808080,$t3
++ rol \$24,$s1
++ and $s2,$t2
++ and $s3,$t3
++ xor $r20,$s0
++ xor $r21,$s1
++ mov $t2,$acc0
++ ror \$16,$t0
++ mov $t3,$acc1
++ ror \$16,$t1
++ lea ($s2,$s2),$r20
++ shr \$7,$t2
++ xor $t0,$s0
++ shr \$7,$t3
++ xor $t1,$s1
++ ror \$8,$t0
++ lea ($s3,$s3),$r21
++ ror \$8,$t1
++ sub $t2,$acc0
++ sub $t3,$acc1
++ xor $t0,$s0
++ xor $t1,$s1
++
++ and \$0xfefefefe,$r20
++ and \$0xfefefefe,$r21
++ and \$0x1b1b1b1b,$acc0
++ and \$0x1b1b1b1b,$acc1
++ mov $s2,$t2
++ mov $s3,$t3
++ xor $acc0,$r20
++ xor $acc1,$r21
++
++ ror \$16,$t2
++ xor $r20,$s2
++ ror \$16,$t3
++ xor $r21,$s3
++ rol \$24,$s2
++ mov 0($sbox),$acc0 # prefetch Te4
++ rol \$24,$s3
++ xor $r20,$s2
++ mov 64($sbox),$acc1
++ xor $r21,$s3
++ mov 128($sbox),$r20
++ xor $t2,$s2
++ ror \$8,$t2
++ xor $t3,$s3
++ ror \$8,$t3
++ xor $t2,$s2
++ mov 192($sbox),$r21
++ xor $t3,$s3
++___
++}
++
++$code.=<<___; # emit _x86_64_AES_encrypt_compact: touch the table, then loop "key XOR, byte substitution, column mix"
++.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
++.align 16
++_x86_64_AES_encrypt_compact:
++.cfi_startproc
++ lea 128($sbox),$inp # size optimization
++ mov 0-128($inp),$acc1 # prefetch Te4
++ mov 32-128($inp),$acc2
++ mov 64-128($inp),$t0
++ mov 96-128($inp),$t1
++ mov 128-128($inp),$acc1
++ mov 160-128($inp),$acc2
++ mov 192-128($inp),$t0
++ mov 224-128($inp),$t1
++ jmp .Lenc_loop_compact
++.align 16
++.Lenc_loop_compact:
++ xor 0($key),$s0 # xor with key
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++ lea 16($key),$key
++___
++ &enccompactvert(); # byte substitution for all four words
++$code.=<<___; # loop exit test: 16(%rsp) here is the slot AES_encrypt fills at its own 8(%rsp) with the end of the key schedule
++ cmp 16(%rsp),$key
++ je .Lenc_compact_done
++___
++ &enctransform(); # column mixing, skipped on the final pass by the branch above
++$code.=<<___; # loop back; the exit path applies the last round key
++ jmp .Lenc_loop_compact
++.align 16
++.Lenc_compact_done:
++ xor 0($key),$s0
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++ .byte 0xf3,0xc3 # rep ret
++.cfi_endproc
++.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
++___
++
++# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
++$code.=<<___; # emit the public single-block entry point (also exported as hidden asm_AES_encrypt); frame: 0=key, 8=end of key schedule, 16=out, 24=original %rsp
++.globl AES_encrypt
++.type AES_encrypt,\@function,3
++.align 16
++.globl asm_AES_encrypt
++.hidden asm_AES_encrypt
++asm_AES_encrypt:
++AES_encrypt:
++.cfi_startproc
++ mov %rsp,%rax
++.cfi_def_cfa_register %rax
++ push %rbx
++.cfi_push %rbx
++ push %rbp
++.cfi_push %rbp
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++
++ # allocate frame "above" key schedule
++ lea -63(%rdx),%rcx # %rdx is key argument
++ and \$-64,%rsp
++ sub %rsp,%rcx
++ neg %rcx
++ and \$0x3c0,%rcx
++ sub %rcx,%rsp
++ sub \$32,%rsp
++
++ mov %rsi,16(%rsp) # save out
++ mov %rax,24(%rsp) # save original stack pointer
++.cfi_cfa_expression %rsp+24,deref,+8
++.Lenc_prologue:
++
++ mov %rdx,$key
++ mov 240($key),$rnds # load rounds
++
++ mov 0(%rdi),$s0 # load input vector
++ mov 4(%rdi),$s1
++ mov 8(%rdi),$s2
++ mov 12(%rdi),$s3
++
++ shl \$4,$rnds
++ lea ($key,$rnds),%rbp
++ mov $key,(%rsp) # key schedule
++ mov %rbp,8(%rsp) # end of key schedule
++
++ # pick Te4 copy which can't "overlap" with stack frame or key schedule
++ lea .LAES_Te+2048(%rip),$sbox
++ lea 768(%rsp),%rbp
++ sub $sbox,%rbp
++ and \$0x300,%rbp
++ lea ($sbox,%rbp),$sbox
++
++ call _x86_64_AES_encrypt_compact
++
++ mov 16(%rsp),$out # restore out
++ mov 24(%rsp),%rsi # restore saved stack pointer
++.cfi_def_cfa %rsi,8
++ mov $s0,0($out) # write output vector
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ mov -48(%rsi),%r15
++.cfi_restore %r15
++ mov -40(%rsi),%r14
++.cfi_restore %r14
++ mov -32(%rsi),%r13
++.cfi_restore %r13
++ mov -24(%rsi),%r12
++.cfi_restore %r12
++ mov -16(%rsi),%rbp
++.cfi_restore %rbp
++ mov -8(%rsi),%rbx
++.cfi_restore %rbx
++ lea (%rsi),%rsp
++.cfi_def_cfa_register %rsp
++.Lenc_epilogue:
++ ret
++.cfi_endproc
++.size AES_encrypt,.-AES_encrypt
++___
++
++#------------------------------------------------------------------#
++
++sub decvert() # append one full table-driven decryption round: the four output words are built side by side in $t0..$t3
++{ my $t3="%r8d"; # zaps $inp!
++
++$code.=<<___; # same lookup scheme as encvert with a different byte-to-word routing; ends by advancing $key by 16 and XORing in the next round key
++ # favor 3-way issue Opteron pipeline...
++ movzb `&lo("$s0")`,$acc0
++ movzb `&lo("$s1")`,$acc1
++ movzb `&lo("$s2")`,$acc2
++ mov 0($sbox,$acc0,8),$t0
++ mov 0($sbox,$acc1,8),$t1
++ mov 0($sbox,$acc2,8),$t2
++
++ movzb `&hi("$s3")`,$acc0
++ movzb `&hi("$s0")`,$acc1
++ movzb `&lo("$s3")`,$acc2
++ xor 3($sbox,$acc0,8),$t0
++ xor 3($sbox,$acc1,8),$t1
++ mov 0($sbox,$acc2,8),$t3
++
++ movzb `&hi("$s1")`,$acc0
++ shr \$16,$s0
++ movzb `&hi("$s2")`,$acc2
++ xor 3($sbox,$acc0,8),$t2
++ shr \$16,$s3
++ xor 3($sbox,$acc2,8),$t3
++
++ shr \$16,$s1
++ lea 16($key),$key
++ shr \$16,$s2
++
++ movzb `&lo("$s2")`,$acc0
++ movzb `&lo("$s3")`,$acc1
++ movzb `&lo("$s0")`,$acc2
++ xor 2($sbox,$acc0,8),$t0
++ xor 2($sbox,$acc1,8),$t1
++ xor 2($sbox,$acc2,8),$t2
++
++ movzb `&hi("$s1")`,$acc0
++ movzb `&hi("$s2")`,$acc1
++ movzb `&lo("$s1")`,$acc2
++ xor 1($sbox,$acc0,8),$t0
++ xor 1($sbox,$acc1,8),$t1
++ xor 2($sbox,$acc2,8),$t3
++
++ movzb `&hi("$s3")`,$acc0
++ mov 12($key),$s3
++ movzb `&hi("$s0")`,$acc2
++ xor 1($sbox,$acc0,8),$t2
++ mov 0($key),$s0
++ xor 1($sbox,$acc2,8),$t3
++
++ xor $t0,$s0
++ mov 4($key),$s1
++ mov 8($key),$s2
++ xor $t2,$s2
++ xor $t1,$s1
++ xor $t3,$s3
++___
++}
++
++sub declastvert() # append the final decryption round: single-byte lookups from the table 2048 bytes past $sbox, then the last round key
++{ my $t3="%r8d"; # zaps $inp!
++
++$code.=<<___; # $sbox is moved forward by 2048 for the duration and restored before the key XOR; the key is read at 16+n($key)
++ lea 2048($sbox),$sbox # size optimization
++ movzb `&lo("$s0")`,$acc0
++ movzb `&lo("$s1")`,$acc1
++ movzb `&lo("$s2")`,$acc2
++ movzb ($sbox,$acc0,1),$t0
++ movzb ($sbox,$acc1,1),$t1
++ movzb ($sbox,$acc2,1),$t2
++
++ movzb `&lo("$s3")`,$acc0
++ movzb `&hi("$s3")`,$acc1
++ movzb `&hi("$s0")`,$acc2
++ movzb ($sbox,$acc0,1),$t3
++ movzb ($sbox,$acc1,1),$acc1 #$t0
++ movzb ($sbox,$acc2,1),$acc2 #$t1
++
++ shl \$8,$acc1
++ shl \$8,$acc2
++
++ xor $acc1,$t0
++ xor $acc2,$t1
++ shr \$16,$s3
++
++ movzb `&hi("$s1")`,$acc0
++ movzb `&hi("$s2")`,$acc1
++ shr \$16,$s0
++ movzb ($sbox,$acc0,1),$acc0 #$t2
++ movzb ($sbox,$acc1,1),$acc1 #$t3
++
++ shl \$8,$acc0
++ shl \$8,$acc1
++ shr \$16,$s1
++ xor $acc0,$t2
++ xor $acc1,$t3
++ shr \$16,$s2
++
++ movzb `&lo("$s2")`,$acc0
++ movzb `&lo("$s3")`,$acc1
++ movzb `&lo("$s0")`,$acc2
++ movzb ($sbox,$acc0,1),$acc0 #$t0
++ movzb ($sbox,$acc1,1),$acc1 #$t1
++ movzb ($sbox,$acc2,1),$acc2 #$t2
++
++ shl \$16,$acc0
++ shl \$16,$acc1
++ shl \$16,$acc2
++
++ xor $acc0,$t0
++ xor $acc1,$t1
++ xor $acc2,$t2
++
++ movzb `&lo("$s1")`,$acc0
++ movzb `&hi("$s1")`,$acc1
++ movzb `&hi("$s2")`,$acc2
++ movzb ($sbox,$acc0,1),$acc0 #$t3
++ movzb ($sbox,$acc1,1),$acc1 #$t0
++ movzb ($sbox,$acc2,1),$acc2 #$t1
++
++ shl \$16,$acc0
++ shl \$24,$acc1
++ shl \$24,$acc2
++
++ xor $acc0,$t3
++ xor $acc1,$t0
++ xor $acc2,$t1
++
++ movzb `&hi("$s3")`,$acc0
++ movzb `&hi("$s0")`,$acc1
++ mov 16+12($key),$s3
++ movzb ($sbox,$acc0,1),$acc0 #$t2
++ movzb ($sbox,$acc1,1),$acc1 #$t3
++ mov 16+0($key),$s0
++
++ shl \$24,$acc0
++ shl \$24,$acc1
++
++ xor $acc0,$t2
++ xor $acc1,$t3
++
++ mov 16+4($key),$s1
++ mov 16+8($key),$s2
++ lea -2048($sbox),$sbox
++ xor $t0,$s0
++ xor $t1,$s1
++ xor $t2,$s2
++ xor $t3,$s3
++___
++}
++
++sub decstep() # emits one output word of a loop round: four table lookups xor-ed together
++{ my ($i,@s) = @_; # $i: output word index 0..3; @s: the four state registers, in lookup order
++ my $tmp0=$acc0;
++ my $tmp1=$acc1;
++ my $tmp2=$acc2;
++ my $out=($t0,$t1,$t2,$s[0])[$i]; # words 0..2 go to temporaries, word 3 is built in place in $s[0]
++
++ $code.=" mov $s[0],$out\n" if ($i!=3);
++ $tmp1=$s[2] if ($i==3); # on the last step the inputs are used directly as scratch
++ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
++ $code.=" and \$0xFF,$out\n";
++
++ $code.=" mov 0($sbox,$out,8),$out\n"; # first lookup: byte 0 indexes the table at offset 0
++ $code.=" shr \$16,$tmp1\n";
++ $tmp2=$s[3] if ($i==3);
++ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
++
++ $tmp0=$s[1] if ($i==3);
++ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
++ $code.=" and \$0xFF,$tmp1\n";
++ $code.=" shr \$24,$tmp2\n";
++
++ $code.=" xor 3($sbox,$tmp0,8),$out\n"; # remaining lookups read the same table at byte offsets 3, 2, 1
++ $code.=" xor 2($sbox,$tmp1,8),$out\n";
++ $code.=" xor 1($sbox,$tmp2,8),$out\n";
++
++ $code.=" mov $t2,$s[1]\n" if ($i==3); # after the last word, copy the temporaries back into state registers
++ $code.=" mov $t1,$s[2]\n" if ($i==3);
++ $code.=" mov $t0,$s[3]\n" if ($i==3);
++ $code.="\n";
++}
++
++sub declast() # emits one output word of the post-loop round, using byte lookups at 2048($sbox)
++{ my ($i,@s)=@_; # $i: output word index 0..3; @s: the four state registers, in lookup order
++ my $tmp0=$acc0;
++ my $tmp1=$acc1;
++ my $tmp2=$acc2;
++ my $out=($t0,$t1,$t2,$s[0])[$i]; # words 0..2 go to temporaries, word 3 is built in place in $s[0]
++
++ $code.=" mov $s[0],$out\n" if ($i!=3);
++ $tmp1=$s[2] if ($i==3); # on the last step the inputs are used directly as scratch
++ $code.=" mov $s[2],$tmp1\n" if ($i!=3);
++ $code.=" and \$0xFF,$out\n";
++
++ $code.=" movzb 2048($sbox,$out,1),$out\n"; # byte 0 of the result
++ $code.=" shr \$16,$tmp1\n";
++ $tmp2=$s[3] if ($i==3);
++ $code.=" mov $s[3],$tmp2\n" if ($i!=3);
++
++ $tmp0=$s[1] if ($i==3);
++ $code.=" movzb ".&hi($s[1]).",$tmp0\n";
++ $code.=" and \$0xFF,$tmp1\n";
++ $code.=" shr \$24,$tmp2\n";
++
++ $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
++ $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
++ $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";
++
++ $code.=" shl \$8,$tmp0\n"; # place the three remaining bytes at bits 8, 16 and 24
++ $code.=" shl \$16,$tmp1\n";
++ $code.=" shl \$24,$tmp2\n";
++
++ $code.=" xor $tmp0,$out\n";
++ $code.=" mov $t2,$s[1]\n" if ($i==3); # after the last word, copy the temporaries back into state registers
++ $code.=" xor $tmp1,$out\n";
++ $code.=" mov $t1,$s[2]\n" if ($i==3);
++ $code.=" xor $tmp2,$out\n";
++ $code.=" mov $t0,$s[3]\n" if ($i==3);
++ $code.="\n";
++}
++
++$code.=<<___;
++.type _x86_64_AES_decrypt,\@abi-omnipotent
++.align 16
++_x86_64_AES_decrypt:
++ xor 0($key),$s0 # xor with key
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++
++ mov 240($key),$rnds # load key->rounds
++ sub \$1,$rnds # loop runs rounds-1 times; the last round follows it
++ jmp .Ldec_loop
++.align 16
++.Ldec_loop:
++___
++ if ($verticalspin) { &decvert(); } # loop body: one round per iteration
++ else { &decstep(0,$s0,$s3,$s2,$s1);
++ &decstep(1,$s1,$s0,$s3,$s2);
++ &decstep(2,$s2,$s1,$s0,$s3);
++ &decstep(3,$s3,$s2,$s1,$s0);
++ $code.=<<___;
++ lea 16($key),$key
++ xor 0($key),$s0 # xor with key
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++___
++ }
++$code.=<<___;
++ sub \$1,$rnds
++ jnz .Ldec_loop
++___
++ if ($verticalspin) { &declastvert(); } # round emitted once, after the loop
++ else { &declast(0,$s0,$s3,$s2,$s1);
++ &declast(1,$s1,$s0,$s3,$s2);
++ &declast(2,$s2,$s1,$s0,$s3);
++ &declast(3,$s3,$s2,$s1,$s0);
++ $code.=<<___;
++ xor 16+0($key),$s0 # xor with key
++ xor 16+4($key),$s1
++ xor 16+8($key),$s2
++ xor 16+12($key),$s3
++___
++ }
++$code.=<<___;
++ .byte 0xf3,0xc3 # rep ret
++.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
++___
++
++sub deccompactvert() # emits the byte-wise table substitution used inside _x86_64_AES_decrypt_compact
++{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); # extra scratch registers local to this sub
++
++$code.=<<___;
++ movzb `&lo("$s0")`,$t0
++ movzb `&lo("$s1")`,$t1
++ movzb `&lo("$s2")`,$t2
++ movzb `&lo("$s3")`,$t3
++ movzb `&hi("$s3")`,$acc0
++ movzb `&hi("$s0")`,$acc1
++ shr \$16,$s3
++ movzb `&hi("$s1")`,$acc2
++ movzb ($sbox,$t0,1),$t0 # byte-wide table lookups (scale 1)
++ movzb ($sbox,$t1,1),$t1
++ movzb ($sbox,$t2,1),$t2
++ movzb ($sbox,$t3,1),$t3
++
++ movzb ($sbox,$acc0,1),$t4 #$t0
++ movzb `&hi("$s2")`,$acc0
++ movzb ($sbox,$acc1,1),$t5 #$t1
++ movzb ($sbox,$acc2,1),$acc2 #$t2
++ movzb ($sbox,$acc0,1),$acc0 #$t3
++
++ shr \$16,$s2
++ shl \$8,$t5
++ shl \$8,$t4
++ movzb `&lo("$s2")`,$acc1
++ shr \$16,$s0
++ xor $t4,$t0
++ shr \$16,$s1
++ movzb `&lo("$s3")`,$t4
++
++ shl \$8,$acc2
++ xor $t5,$t1
++ shl \$8,$acc0
++ movzb `&lo("$s0")`,$t5
++ movzb ($sbox,$acc1,1),$acc1 #$t0
++ xor $acc2,$t2
++ movzb `&lo("$s1")`,$acc2
++
++ shl \$16,$acc1
++ xor $acc0,$t3
++ movzb ($sbox,$t4,1),$t4 #$t1
++ movzb `&hi("$s1")`,$acc0
++ movzb ($sbox,$acc2,1),$acc2 #$t3
++ xor $acc1,$t0
++ movzb ($sbox,$t5,1),$t5 #$t2
++ movzb `&hi("$s2")`,$acc1
++
++ shl \$16,$acc2
++ shl \$16,$t4
++ shl \$16,$t5
++ xor $acc2,$t3
++ movzb `&hi("$s3")`,$acc2
++ xor $t4,$t1
++ shr \$8,$s0
++ xor $t5,$t2
++
++ movzb ($sbox,$acc0,1),$acc0 #$t0
++ movzb ($sbox,$acc1,1),$s1 #$t1
++ movzb ($sbox,$acc2,1),$s2 #$t2
++ movzb ($sbox,$s0,1),$s3 #$t3
++
++ mov $t0,$s0 # reassemble the state words from the partial results
++ shl \$24,$acc0
++ shl \$24,$s1
++ shl \$24,$s2
++ xor $acc0,$s0
++ shl \$24,$s3
++ xor $t1,$s1
++ xor $t2,$s2
++ xor $t3,$s3
++___
++}
++
++# parallelized version! input is pair of 64-bit values: %rax=s1.s0
++# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
++# %ecx=s2 and %edx=s3.
++sub dectransform()
++{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
++ my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
++ my $prefetch = shift; # optional flag: when true, loads from $sbox are interleaved with the final rotates
++
++$code.=<<___;
++ mov $mask80,$tp40
++ mov $mask80,$tp48
++ and $tp10,$tp40
++ and $tp18,$tp48
++ mov $tp40,$acc0
++ mov $tp48,$acc8
++ shr \$7,$tp40
++ lea ($tp10,$tp10),$tp20 # tp1+tp1
++ shr \$7,$tp48
++ lea ($tp18,$tp18),$tp28 # tp1+tp1
++ sub $tp40,$acc0
++ sub $tp48,$acc8
++ and $maskfe,$tp20
++ and $maskfe,$tp28
++ and $mask1b,$acc0
++ and $mask1b,$acc8
++ xor $acc0,$tp20
++ xor $acc8,$tp28
++ mov $mask80,$tp80
++ mov $mask80,$tp88
++
++ and $tp20,$tp80
++ and $tp28,$tp88
++ mov $tp80,$acc0
++ mov $tp88,$acc8
++ shr \$7,$tp80
++ lea ($tp20,$tp20),$tp40 # tp2+tp2
++ shr \$7,$tp88
++ lea ($tp28,$tp28),$tp48 # tp2+tp2
++ sub $tp80,$acc0
++ sub $tp88,$acc8
++ and $maskfe,$tp40
++ and $maskfe,$tp48
++ and $mask1b,$acc0
++ and $mask1b,$acc8
++ xor $acc0,$tp40
++ xor $acc8,$tp48
++ mov $mask80,$tp80
++ mov $mask80,$tp88
++
++ and $tp40,$tp80
++ and $tp48,$tp88
++ mov $tp80,$acc0
++ mov $tp88,$acc8
++ shr \$7,$tp80
++ xor $tp10,$tp20 # tp2^=tp1
++ shr \$7,$tp88
++ xor $tp18,$tp28 # tp2^=tp1
++ sub $tp80,$acc0
++ sub $tp88,$acc8
++ lea ($tp40,$tp40),$tp80 # tp4+tp4
++ lea ($tp48,$tp48),$tp88 # tp4+tp4
++ xor $tp10,$tp40 # tp4^=tp1
++ xor $tp18,$tp48 # tp4^=tp1
++ and $maskfe,$tp80
++ and $maskfe,$tp88
++ and $mask1b,$acc0
++ and $mask1b,$acc8
++ xor $acc0,$tp80
++ xor $acc8,$tp88
++
++ xor $tp80,$tp10 # tp1^=tp8
++ xor $tp88,$tp18 # tp1^=tp8
++ xor $tp80,$tp20 # tp2^tp1^=tp8
++ xor $tp88,$tp28 # tp2^tp1^=tp8
++ mov $tp10,$acc0
++ mov $tp18,$acc8
++ xor $tp80,$tp40 # tp4^tp1^=tp8
++ shr \$32,$acc0 # acc0 = upper 32 bits of tp1^tp8
++ xor $tp88,$tp48 # tp4^tp1^=tp8
++ shr \$32,$acc8 # acc8 = upper 32 bits of tp1^tp8
++ xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
++ rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
++ xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
++ rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
++ xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
++ rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
++ xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
++
++ rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
++ xor `&LO("$tp80")`,`&LO("$tp10")`
++ shr \$32,$tp80
++ xor `&LO("$tp88")`,`&LO("$tp18")`
++ shr \$32,$tp88
++ xor `&LO("$tp80")`,`&LO("$acc0")`
++ xor `&LO("$tp88")`,`&LO("$acc8")`
++
++ mov $tp20,$tp80
++ rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
++ mov $tp28,$tp88
++ rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
++ shr \$32,$tp80
++ xor `&LO("$tp20")`,`&LO("$tp10")`
++ shr \$32,$tp88
++ xor `&LO("$tp28")`,`&LO("$tp18")`
++ rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
++ mov $tp40,$tp20
++ rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
++ mov $tp48,$tp28
++ shr \$32,$tp20
++ xor `&LO("$tp80")`,`&LO("$acc0")`
++ shr \$32,$tp28
++ xor `&LO("$tp88")`,`&LO("$acc8")`
++
++ `"mov 0($sbox),$mask80" if ($prefetch)`
++ rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
++ `"mov 64($sbox),$maskfe" if ($prefetch)`
++ rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
++ `"mov 128($sbox),$mask1b" if ($prefetch)`
++ rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
++ `"mov 192($sbox),$tp80" if ($prefetch)`
++ xor `&LO("$tp40")`,`&LO("$tp10")`
++ rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
++ xor `&LO("$tp48")`,`&LO("$tp18")`
++ `"mov 256($sbox),$tp88" if ($prefetch)`
++ xor `&LO("$tp20")`,`&LO("$acc0")`
++ xor `&LO("$tp28")`,`&LO("$acc8")`
++___
++}
++
++$code.=<<___;
++.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
++.align 16
++_x86_64_AES_decrypt_compact:
++.cfi_startproc
++ lea 128($sbox),$inp # size optimization
++ mov 0-128($inp),$acc1 # prefetch Td4
++ mov 32-128($inp),$acc2
++ mov 64-128($inp),$t0
++ mov 96-128($inp),$t1
++ mov 128-128($inp),$acc1
++ mov 160-128($inp),$acc2
++ mov 192-128($inp),$t0
++ mov 224-128($inp),$t1
++ jmp .Ldec_loop_compact
++
++.align 16
++.Ldec_loop_compact:
++ xor 0($key),$s0 # xor with key
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++ lea 16($key),$key
++___
++ &deccompactvert();
++$code.=<<___;
++ cmp 16(%rsp),$key # caller's 8(%rsp) slot (end of key schedule), shifted by the return address
++ je .Ldec_compact_done
++
++ mov 256+0($sbox),$mask80
++ shl \$32,%rbx
++ shl \$32,%rdx
++ mov 256+8($sbox),$maskfe
++ or %rbx,%rax # rax = s1.s0, the 64-bit pair dectransform expects
++ or %rdx,%rcx # rcx = s3.s2
++ mov 256+16($sbox),$mask1b
++___
++ &dectransform(1); # 1 = also emit the interleaved table loads
++$code.=<<___;
++ jmp .Ldec_loop_compact
++.align 16
++.Ldec_compact_done:
++ xor 0($key),$s0
++ xor 4($key),$s1
++ xor 8($key),$s2
++ xor 12($key),$s3
++ .byte 0xf3,0xc3 # rep ret
++.cfi_endproc
++.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
++___
++
++# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
++$code.=<<___;
++.globl AES_decrypt
++.type AES_decrypt,\@function,3
++.align 16
++.globl asm_AES_decrypt
++.hidden asm_AES_decrypt
++asm_AES_decrypt:
++AES_decrypt:
++.cfi_startproc
++ mov %rsp,%rax # keep the entry stack pointer; it is stored at 24(%rsp) below
++.cfi_def_cfa_register %rax
++ push %rbx
++.cfi_push %rbx
++ push %rbp
++.cfi_push %rbp
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++
++ # allocate frame "above" key schedule
++ lea -63(%rdx),%rcx # %rdx is key argument
++ and \$-64,%rsp
++ sub %rsp,%rcx
++ neg %rcx
++ and \$0x3c0,%rcx
++ sub %rcx,%rsp
++ sub \$32,%rsp
++
++ mov %rsi,16(%rsp) # save out
++ mov %rax,24(%rsp) # save original stack pointer
++.cfi_cfa_expression %rsp+24,deref,+8
++.Ldec_prologue:
++
++ mov %rdx,$key
++ mov 240($key),$rnds # load rounds
++
++ mov 0(%rdi),$s0 # load input vector
++ mov 4(%rdi),$s1
++ mov 8(%rdi),$s2
++ mov 12(%rdi),$s3
++
++ shl \$4,$rnds # rounds*16 = byte offset of the last round key
++ lea ($key,$rnds),%rbp
++ mov $key,(%rsp) # key schedule
++ mov %rbp,8(%rsp) # end of key schedule
++
++ # pick Td4 copy which can't "overlap" with stack frame or key schedule
++ lea .LAES_Td+2048(%rip),$sbox
++ lea 768(%rsp),%rbp
++ sub $sbox,%rbp
++ and \$0x300,%rbp
++ lea ($sbox,%rbp),$sbox
++ shr \$3,%rbp # recall "magic" constants!
++ add %rbp,$sbox
++
++ call _x86_64_AES_decrypt_compact
++
++ mov 16(%rsp),$out # restore out
++ mov 24(%rsp),%rsi # restore saved stack pointer
++.cfi_def_cfa %rsi,8
++ mov $s0,0($out) # write output vector
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ mov -48(%rsi),%r15
++.cfi_restore %r15
++ mov -40(%rsi),%r14
++.cfi_restore %r14
++ mov -32(%rsi),%r13
++.cfi_restore %r13
++ mov -24(%rsi),%r12
++.cfi_restore %r12
++ mov -16(%rsi),%rbp
++.cfi_restore %rbp
++ mov -8(%rsi),%rbx
++.cfi_restore %rbx
++ lea (%rsi),%rsp # switch back to the entry stack pointer
++.cfi_def_cfa_register %rsp
++.Ldec_epilogue:
++ ret
++.cfi_endproc
++.size AES_decrypt,.-AES_decrypt
++___
++#------------------------------------------------------------------#
++
++sub enckey() # emits the table-substitution + rcon step shared by the .L10/.L12/.L14 key-expansion loops
++{
++$code.=<<___;
++ movz %dl,%esi # rk[i]>>0
++ movzb -128(%rbp,%rsi),%ebx # table byte; the caller biased rbp by +128, hence the -128
++ movz %dh,%esi # rk[i]>>8
++ shl \$24,%ebx
++ xor %ebx,%eax
++
++ movzb -128(%rbp,%rsi),%ebx
++ shr \$16,%edx
++ movz %dl,%esi # rk[i]>>16
++ xor %ebx,%eax
++
++ movzb -128(%rbp,%rsi),%ebx
++ movz %dh,%esi # rk[i]>>24
++ shl \$8,%ebx
++ xor %ebx,%eax
++
++ movzb -128(%rbp,%rsi),%ebx
++ shl \$16,%ebx
++ xor %ebx,%eax
++
++ xor 1024-128(%rbp,%rcx,4),%eax # rcon
++___
++}
++
++# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
++# AES_KEY *key)
++$code.=<<___;
++.globl AES_set_encrypt_key
++.type AES_set_encrypt_key,\@function,3
++.align 16
++AES_set_encrypt_key:
++.cfi_startproc
++ push %rbx
++.cfi_push %rbx
++ push %rbp
++.cfi_push %rbp
++ push %r12 # redundant, but allows to share
++.cfi_push %r12
++ push %r13 # exception handler...
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ sub \$8,%rsp # with the six pushes: 56 bytes, released by the add below
++.cfi_adjust_cfa_offset 8
++.Lenc_key_prologue:
++
++ call _x86_64_AES_set_encrypt_key
++
++ mov 40(%rsp),%rbp # saved rbp sits at 40(rsp), saved rbx at 48(rsp)
++.cfi_restore %rbp
++ mov 48(%rsp),%rbx
++.cfi_restore %rbx
++ add \$56,%rsp
++.cfi_adjust_cfa_offset -56
++.Lenc_key_epilogue:
++ ret
++.cfi_endproc
++.size AES_set_encrypt_key,.-AES_set_encrypt_key
++
++.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
++.align 16
++_x86_64_AES_set_encrypt_key:
++.cfi_startproc
++ mov %esi,%ecx # %ecx=bits
++ mov %rdi,%rsi # %rsi=userKey
++ mov %rdx,%rdi # %rdi=key
++
++ test \$-1,%rsi # userKey == NULL?
++ jz .Lbadpointer
++ test \$-1,%rdi # key == NULL?
++ jz .Lbadpointer
++
++ lea .LAES_Te(%rip),%rbp
++ lea 2048+128(%rbp),%rbp
++
++ # prefetch Te4
++ mov 0-128(%rbp),%eax
++ mov 32-128(%rbp),%ebx
++ mov 64-128(%rbp),%r8d
++ mov 96-128(%rbp),%edx
++ mov 128-128(%rbp),%eax
++ mov 160-128(%rbp),%ebx
++ mov 192-128(%rbp),%r8d
++ mov 224-128(%rbp),%edx
++
++ cmp \$128,%ecx
++ je .L10rounds
++ cmp \$192,%ecx
++ je .L12rounds
++ cmp \$256,%ecx
++ je .L14rounds
++ mov \$-2,%rax # invalid number of bits
++ jmp .Lexit
++
++.L10rounds:
++ mov 0(%rsi),%rax # copy first 4 dwords
++ mov 8(%rsi),%rdx
++ mov %rax,0(%rdi)
++ mov %rdx,8(%rdi)
++
++ shr \$32,%rdx
++ xor %ecx,%ecx
++ jmp .L10shortcut
++.align 4
++.L10loop:
++ mov 0(%rdi),%eax # rk[0]
++ mov 12(%rdi),%edx # rk[3]
++.L10shortcut:
++___
++ &enckey ();
++$code.=<<___;
++ mov %eax,16(%rdi) # rk[4]
++ xor 4(%rdi),%eax
++ mov %eax,20(%rdi) # rk[5]
++ xor 8(%rdi),%eax
++ mov %eax,24(%rdi) # rk[6]
++ xor 12(%rdi),%eax
++ mov %eax,28(%rdi) # rk[7]
++ add \$1,%ecx
++ lea 16(%rdi),%rdi
++ cmp \$10,%ecx
++ jl .L10loop
++
++ movl \$10,80(%rdi) # setup number of rounds
++ xor %rax,%rax
++ jmp .Lexit
++
++.L12rounds:
++ mov 0(%rsi),%rax # copy first 6 dwords
++ mov 8(%rsi),%rbx
++ mov 16(%rsi),%rdx
++ mov %rax,0(%rdi)
++ mov %rbx,8(%rdi)
++ mov %rdx,16(%rdi)
++
++ shr \$32,%rdx
++ xor %ecx,%ecx
++ jmp .L12shortcut
++.align 4
++.L12loop:
++ mov 0(%rdi),%eax # rk[0]
++ mov 20(%rdi),%edx # rk[5]
++.L12shortcut:
++___
++ &enckey ();
++$code.=<<___;
++ mov %eax,24(%rdi) # rk[6]
++ xor 4(%rdi),%eax
++ mov %eax,28(%rdi) # rk[7]
++ xor 8(%rdi),%eax
++ mov %eax,32(%rdi) # rk[8]
++ xor 12(%rdi),%eax
++ mov %eax,36(%rdi) # rk[9]
++
++ cmp \$7,%ecx
++ je .L12break
++ add \$1,%ecx
++
++ xor 16(%rdi),%eax
++ mov %eax,40(%rdi) # rk[10]
++ xor 20(%rdi),%eax
++ mov %eax,44(%rdi) # rk[11]
++
++ lea 24(%rdi),%rdi
++ jmp .L12loop
++.L12break:
++ movl \$12,72(%rdi) # setup number of rounds
++ xor %rax,%rax
++ jmp .Lexit
++
++.L14rounds:
++ mov 0(%rsi),%rax # copy first 8 dwords
++ mov 8(%rsi),%rbx
++ mov 16(%rsi),%rcx
++ mov 24(%rsi),%rdx
++ mov %rax,0(%rdi)
++ mov %rbx,8(%rdi)
++ mov %rcx,16(%rdi)
++ mov %rdx,24(%rdi)
++
++ shr \$32,%rdx
++ xor %ecx,%ecx
++ jmp .L14shortcut
++.align 4
++.L14loop:
++ mov 0(%rdi),%eax # rk[0]
++ mov 28(%rdi),%edx # rk[7]
++.L14shortcut:
++___
++ &enckey ();
++$code.=<<___;
++ mov %eax,32(%rdi) # rk[8]
++ xor 4(%rdi),%eax
++ mov %eax,36(%rdi) # rk[9]
++ xor 8(%rdi),%eax
++ mov %eax,40(%rdi) # rk[10]
++ xor 12(%rdi),%eax
++ mov %eax,44(%rdi) # rk[11]
++
++ cmp \$6,%ecx
++ je .L14break
++ add \$1,%ecx
++
++ mov %eax,%edx
++ mov 16(%rdi),%eax # rk[4]
++ movz %dl,%esi # rk[11]>>0
++ movzb -128(%rbp,%rsi),%ebx
++ movz %dh,%esi # rk[11]>>8
++ xor %ebx,%eax
++
++ movzb -128(%rbp,%rsi),%ebx
++ shr \$16,%edx
++ shl \$8,%ebx
++ movz %dl,%esi # rk[11]>>16
++ xor %ebx,%eax
++
++ movzb -128(%rbp,%rsi),%ebx
++ movz %dh,%esi # rk[11]>>24
++ shl \$16,%ebx
++ xor %ebx,%eax
++
++ movzb -128(%rbp,%rsi),%ebx
++ shl \$24,%ebx
++ xor %ebx,%eax
++
++ mov %eax,48(%rdi) # rk[12]
++ xor 20(%rdi),%eax
++ mov %eax,52(%rdi) # rk[13]
++ xor 24(%rdi),%eax
++ mov %eax,56(%rdi) # rk[14]
++ xor 28(%rdi),%eax
++ mov %eax,60(%rdi) # rk[15]
++
++ lea 32(%rdi),%rdi
++ jmp .L14loop
++.L14break:
++ movl \$14,48(%rdi) # setup number of rounds
++ xor %rax,%rax
++ jmp .Lexit
++
++.Lbadpointer:
++ mov \$-1,%rax # NULL userKey or key pointer
++.Lexit:
++ .byte 0xf3,0xc3 # rep ret
++.cfi_endproc
++.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
++___
++
++sub deckey_ref() # 32-bit, one-word-at-a-time form of the transform; NOTE(review): no caller in the visible part of the file
++{ my ($i,$ptr,$te,$td) = @_; # $i: byte offset, $ptr: base register; $te and $td are accepted but not used below
++ my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
++$code.=<<___;
++ mov $i($ptr),$tp1 # load the 32-bit word to transform
++ mov $tp1,$acc
++ and \$0x80808080,$acc
++ mov $acc,$tp4
++ shr \$7,$tp4
++ lea 0($tp1,$tp1),$tp2
++ sub $tp4,$acc
++ and \$0xfefefefe,$tp2
++ and \$0x1b1b1b1b,$acc
++ xor $tp2,$acc
++ mov $acc,$tp2
++
++ and \$0x80808080,$acc
++ mov $acc,$tp8
++ shr \$7,$tp8
++ lea 0($tp2,$tp2),$tp4
++ sub $tp8,$acc
++ and \$0xfefefefe,$tp4
++ and \$0x1b1b1b1b,$acc
++ xor $tp1,$tp2 # tp2^tp1
++ xor $tp4,$acc
++ mov $acc,$tp4
++
++ and \$0x80808080,$acc
++ mov $acc,$tp8
++ shr \$7,$tp8
++ sub $tp8,$acc
++ lea 0($tp4,$tp4),$tp8
++ xor $tp1,$tp4 # tp4^tp1
++ and \$0xfefefefe,$tp8
++ and \$0x1b1b1b1b,$acc
++ xor $acc,$tp8
++
++ xor $tp8,$tp1 # tp1^tp8
++ rol \$8,$tp1 # ROTATE(tp1^tp8,8)
++ xor $tp8,$tp2 # tp2^tp1^tp8
++ xor $tp8,$tp4 # tp4^tp1^tp8
++ xor $tp2,$tp8
++ xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
++
++ xor $tp8,$tp1
++ rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
++ xor $tp2,$tp1
++ rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
++ xor $tp4,$tp1
++
++ mov $tp1,$i($ptr) # write the result back to the same location
++___
++}
++
++# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
++# AES_KEY *key)
++$code.=<<___;
++.globl AES_set_decrypt_key
++.type AES_set_decrypt_key,\@function,3
++.align 16
++AES_set_decrypt_key:
++.cfi_startproc
++ push %rbx
++.cfi_push %rbx
++ push %rbp
++.cfi_push %rbp
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ push %rdx # save key schedule
++.cfi_adjust_cfa_offset 8
++.Ldec_key_prologue:
++
++ call _x86_64_AES_set_encrypt_key
++ mov (%rsp),%r8 # restore key schedule
++ cmp \$0,%eax
++ jne .Labort
++
++ mov 240(%r8),%r14d # pull number of rounds
++ xor %rdi,%rdi
++ lea (%rdi,%r14d,4),%rcx # rcx = 4*rounds (rdi was just zeroed)
++ mov %r8,%rsi
++ lea (%r8,%rcx,4),%rdi # pointer to last chunk
++.align 4
++.Linvert: # swap 16-byte round keys, walking rsi up and rdi down until they meet
++ mov 0(%rsi),%rax
++ mov 8(%rsi),%rbx
++ mov 0(%rdi),%rcx
++ mov 8(%rdi),%rdx
++ mov %rax,0(%rdi)
++ mov %rbx,8(%rdi)
++ mov %rcx,0(%rsi)
++ mov %rdx,8(%rsi)
++ lea 16(%rsi),%rsi
++ lea -16(%rdi),%rdi
++ cmp %rsi,%rdi
++ jne .Linvert
++
++ lea .LAES_Te+2048+1024(%rip),%rax # rcon
++
++ mov 40(%rax),$mask80
++ mov 48(%rax),$maskfe
++ mov 56(%rax),$mask1b
++
++ mov %r8,$key
++ sub \$1,%r14d # rounds-1 iterations: the first and last round keys are left untransformed
++.align 4
++.Lpermute:
++ lea 16($key),$key
++ mov 0($key),%rax
++ mov 8($key),%rcx
++___
++ &dectransform (); # no argument: the interleaved table loads are not emitted
++$code.=<<___;
++ mov %eax,0($key)
++ mov %ebx,4($key)
++ mov %ecx,8($key)
++ mov %edx,12($key)
++ sub \$1,%r14d
++ jnz .Lpermute
++
++ xor %rax,%rax # return 0
++.Labort:
++ mov 8(%rsp),%r15
++.cfi_restore %r15
++ mov 16(%rsp),%r14
++.cfi_restore %r14
++ mov 24(%rsp),%r13
++.cfi_restore %r13
++ mov 32(%rsp),%r12
++.cfi_restore %r12
++ mov 40(%rsp),%rbp
++.cfi_restore %rbp
++ mov 48(%rsp),%rbx
++.cfi_restore %rbx
++ add \$56,%rsp
++.cfi_adjust_cfa_offset -56
++.Ldec_key_epilogue:
++ ret
++.cfi_endproc
++.size AES_set_decrypt_key,.-AES_set_decrypt_key
++___
++
++# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
++# size_t length, const AES_KEY *key,
++# unsigned char *ivp,const int enc);
++{
++# stack frame layout
++# -8(%rsp) return address
++my $keyp="0(%rsp)"; # one to pass as $key
++my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
++my $_rsp="16(%rsp)"; # saved %rsp
++my $_inp="24(%rsp)"; # copy of 1st parameter, inp
++my $_out="32(%rsp)"; # copy of 2nd parameter, out
++my $_len="40(%rsp)"; # copy of 3rd parameter, length
++my $_key="48(%rsp)"; # copy of 4th parameter, key
++my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
++my $ivec="64(%rsp)"; # ivec[16]
++my $aes_key="80(%rsp)"; # copy of aes_key
++my $mark="80+240(%rsp)"; # copy of aes_key->rounds; zeroed on fast-path entry, set only when the schedule is copied to the stack
++
++$code.=<<___;
++.globl AES_cbc_encrypt
++.type AES_cbc_encrypt,\@function,6
++.align 16
++.extern OPENSSL_ia32cap_P
++.globl asm_AES_cbc_encrypt
++.hidden asm_AES_cbc_encrypt
++asm_AES_cbc_encrypt:
++AES_cbc_encrypt:
++.cfi_startproc
++ cmp \$0,%rdx # check length
++ je .Lcbc_epilogue
++ pushfq
++# This could be .cfi_push 49, but libunwind fails on registers it does not
++# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
++.cfi_adjust_cfa_offset 8
++ push %rbx
++.cfi_push %rbx
++ push %rbp
++.cfi_push %rbp
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++.Lcbc_prologue:
++
++ cld
++ mov %r9d,%r9d # clear upper half of enc
++
++ lea .LAES_Te(%rip),$sbox
++ lea .LAES_Td(%rip),%r10
++ cmp \$0,%r9
++ cmoveq %r10,$sbox
++
++.cfi_remember_state
++ mov OPENSSL_ia32cap_P(%rip),%r10d
++ cmp \$$speed_limit,%rdx
++ jb .Lcbc_slow_prologue
++ test \$15,%rdx
++ jnz .Lcbc_slow_prologue
++ bt \$28,%r10d # bit 28 of the OPENSSL_ia32cap_P word loaded above
++ jc .Lcbc_slow_prologue
++
++ # allocate aligned stack frame...
++ lea -88-248(%rsp),$key
++ and \$-64,$key
++
++ # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
++ mov $sbox,%r10
++ lea 2304($sbox),%r11
++ mov $key,%r12
++ and \$0xFFF,%r10 # s = $sbox&0xfff
++ and \$0xFFF,%r11 # e = ($sbox+2304)&0xfff
++ and \$0xFFF,%r12 # p = %rsp&0xfff
++
++ cmp %r11,%r12 # if (p>=e) %rsp -= (p-e);
++ jb .Lcbc_te_break_out
++ sub %r11,%r12
++ sub %r12,$key
++ jmp .Lcbc_te_ok
++.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
++ sub %r10,%r12
++ and \$0xFFF,%r12
++ add \$320,%r12
++ sub %r12,$key
++.align 4
++.Lcbc_te_ok:
++
++ xchg %rsp,$key
++.cfi_def_cfa_register $key
++ #add \$8,%rsp # reserve for return address!
++ mov $key,$_rsp # save %rsp
++.cfi_cfa_expression $_rsp,deref,+64
++.Lcbc_fast_body:
++ mov %rdi,$_inp # save copy of inp
++ mov %rsi,$_out # save copy of out
++ mov %rdx,$_len # save copy of len
++ mov %rcx,$_key # save copy of key
++ mov %r8,$_ivp # save copy of ivp
++ movl \$0,$mark # copy of aes_key->rounds = 0;
++ mov %r8,%rbp # rearrange input arguments
++ mov %r9,%rbx
++ mov %rsi,$out
++ mov %rdi,$inp
++ mov %rcx,$key
++
++ mov 240($key),%eax # key->rounds
++ # do we copy key schedule to stack?
++ mov $key,%r10
++ sub $sbox,%r10
++ and \$0xfff,%r10
++ cmp \$2304,%r10
++ jb .Lcbc_do_ecopy
++ cmp \$4096-248,%r10
++ jb .Lcbc_skip_ecopy
++.align 4
++.Lcbc_do_ecopy:
++ mov $key,%rsi
++ lea $aes_key,%rdi
++ lea $aes_key,$key
++ mov \$240/8,%ecx
++ .long 0x90A548F3 # rep movsq
++ mov %eax,(%rdi) # copy aes_key->rounds
++.Lcbc_skip_ecopy:
++ mov $key,$keyp # save key pointer
++
++ mov \$18,%ecx # 18 iterations x 128 bytes = 2304 bytes of table touched
++.align 4
++.Lcbc_prefetch_te:
++ mov 0($sbox),%r10
++ mov 32($sbox),%r11
++ mov 64($sbox),%r12
++ mov 96($sbox),%r13
++ lea 128($sbox),$sbox
++ sub \$1,%ecx
++ jnz .Lcbc_prefetch_te
++ lea -2304($sbox),$sbox
++
++ cmp \$0,%rbx
++ je .LFAST_DECRYPT
++
++#----------------------------- ENCRYPT -----------------------------#
++ mov 0(%rbp),$s0 # load iv
++ mov 4(%rbp),$s1
++ mov 8(%rbp),$s2
++ mov 12(%rbp),$s3
++
++.align 4
++.Lcbc_fast_enc_loop:
++ xor 0($inp),$s0
++ xor 4($inp),$s1
++ xor 8($inp),$s2
++ xor 12($inp),$s3
++ mov $keyp,$key # restore key
++ mov $inp,$_inp # if ($verticalspin) save inp
++
++ call _x86_64_AES_encrypt
++
++ mov $_inp,$inp # if ($verticalspin) restore inp
++ mov $_len,%r10
++ mov $s0,0($out)
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ lea 16($inp),$inp
++ lea 16($out),$out
++ sub \$16,%r10
++ test \$-16,%r10 # at least one full 16-byte block left?
++ mov %r10,$_len # mov leaves the flags from test intact for the jnz
++ jnz .Lcbc_fast_enc_loop
++ mov $_ivp,%rbp # restore ivp
++ mov $s0,0(%rbp) # save ivec
++ mov $s1,4(%rbp)
++ mov $s2,8(%rbp)
++ mov $s3,12(%rbp)
++
++ jmp .Lcbc_fast_cleanup
++
++#----------------------------- DECRYPT -----------------------------#
++.align 16
++.LFAST_DECRYPT:
++ cmp $inp,$out
++ je .Lcbc_fast_dec_in_place
++
++ mov %rbp,$ivec # out-of-place path keeps a pointer to the current iv in the ivec slot
++.align 4
++.Lcbc_fast_dec_loop:
++ mov 0($inp),$s0 # read input
++ mov 4($inp),$s1
++ mov 8($inp),$s2
++ mov 12($inp),$s3
++ mov $keyp,$key # restore key
++ mov $inp,$_inp # if ($verticalspin) save inp
++
++ call _x86_64_AES_decrypt
++
++ mov $ivec,%rbp # load ivp
++ mov $_inp,$inp # if ($verticalspin) restore inp
++ mov $_len,%r10 # load len
++ xor 0(%rbp),$s0 # xor iv
++ xor 4(%rbp),$s1
++ xor 8(%rbp),$s2
++ xor 12(%rbp),$s3
++ mov $inp,%rbp # current input, next iv
++
++ sub \$16,%r10
++ mov %r10,$_len # update len
++ mov %rbp,$ivec # update ivp
++
++ mov $s0,0($out) # write output
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ lea 16($inp),$inp
++ lea 16($out),$out
++ jnz .Lcbc_fast_dec_loop # flags are still those of the sub above; mov and lea do not modify them
++ mov $_ivp,%r12 # load user ivp
++ mov 0(%rbp),%r10 # load iv
++ mov 8(%rbp),%r11
++ mov %r10,0(%r12) # copy back to user
++ mov %r11,8(%r12)
++ jmp .Lcbc_fast_cleanup
++
++.align 16
++.Lcbc_fast_dec_in_place:
++ mov 0(%rbp),%r10 # copy iv to stack
++ mov 8(%rbp),%r11
++ mov %r10,0+$ivec
++ mov %r11,8+$ivec
++.align 4
++.Lcbc_fast_dec_in_place_loop:
++ mov 0($inp),$s0 # load input
++ mov 4($inp),$s1
++ mov 8($inp),$s2
++ mov 12($inp),$s3
++ mov $keyp,$key # restore key
++ mov $inp,$_inp # if ($verticalspin) save inp
++
++ call _x86_64_AES_decrypt
++
++ mov $_inp,$inp # if ($verticalspin) restore inp
++ mov $_len,%r10
++ xor 0+$ivec,$s0
++ xor 4+$ivec,$s1
++ xor 8+$ivec,$s2
++ xor 12+$ivec,$s3
++
++ mov 0($inp),%r11 # load input
++ mov 8($inp),%r12
++ sub \$16,%r10
++ jz .Lcbc_fast_dec_in_place_done
++
++ mov %r11,0+$ivec # copy input to iv
++ mov %r12,8+$ivec
++
++ mov $s0,0($out) # save output [zaps input]
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ lea 16($inp),$inp
++ lea 16($out),$out
++ mov %r10,$_len
++ jmp .Lcbc_fast_dec_in_place_loop
++.Lcbc_fast_dec_in_place_done:
++ mov $_ivp,%rdi
++ mov %r11,0(%rdi) # copy iv back to user
++ mov %r12,8(%rdi)
++
++ mov $s0,0($out) # save output [zaps input]
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++.align 4
++.Lcbc_fast_cleanup:
++ cmpl \$0,$mark # was the key schedule copied?
++ lea $aes_key,%rdi
++ je .Lcbc_exit
++ mov \$240/8,%ecx # 30 quadwords = the 240-byte stack copy
++ xor %rax,%rax
++ .long 0x90AB48F3 # rep stosq
++
++ jmp .Lcbc_exit
++
++#--------------------------- SLOW ROUTINE ---------------------------#
++.align 16
++.Lcbc_slow_prologue:
++.cfi_restore_state
++ # allocate aligned stack frame...
++ lea -88(%rsp),%rbp
++ and \$-64,%rbp
++ # ... just "above" key schedule
++ lea -88-63(%rcx),%r10
++ sub %rbp,%r10
++ neg %r10
++ and \$0x3c0,%r10
++ sub %r10,%rbp
++
++ xchg %rsp,%rbp
++.cfi_def_cfa_register %rbp
++ #add \$8,%rsp # reserve for return address!
++ mov %rbp,$_rsp # save %rsp
++.cfi_cfa_expression $_rsp,deref,+64
++.Lcbc_slow_body:
++ #mov %rdi,$_inp # save copy of inp
++ #mov %rsi,$_out # save copy of out
++ #mov %rdx,$_len # save copy of len
++ #mov %rcx,$_key # save copy of key
++ mov %r8,$_ivp # save copy of ivp
++ mov %r8,%rbp # rearrange input arguments
++ mov %r9,%rbx
++ mov %rsi,$out
++ mov %rdi,$inp
++ mov %rcx,$key
++ mov %rdx,%r10 # r10 = remaining length for the slow loops
++
++ mov 240($key),%eax
++ mov $key,$keyp # save key pointer
++ shl \$4,%eax # rounds*16 = byte offset of the last round key
++ lea ($key,%rax),%rax
++ mov %rax,$keyend
++
++ # pick Te4 copy which can't "overlap" with stack frame or key schedule
++ lea 2048($sbox),$sbox
++ lea 768-8(%rsp),%rax
++ sub $sbox,%rax
++ and \$0x300,%rax
++ lea ($sbox,%rax),$sbox
++
++ cmp \$0,%rbx
++ je .LSLOW_DECRYPT
++
++#--------------------------- SLOW ENCRYPT ---------------------------#
++ test \$-16,%r10 # check upon length
++ mov 0(%rbp),$s0 # load iv
++ mov 4(%rbp),$s1
++ mov 8(%rbp),$s2
++ mov 12(%rbp),$s3
++ jz .Lcbc_slow_enc_tail # short input...
++
++.align 4
++.Lcbc_slow_enc_loop:
++ xor 0($inp),$s0
++ xor 4($inp),$s1
++ xor 8($inp),$s2
++ xor 12($inp),$s3
++ mov $keyp,$key # restore key
++ mov $inp,$_inp # save inp
++ mov $out,$_out # save out
++ mov %r10,$_len # save len
++
++ call _x86_64_AES_encrypt_compact
++
++ mov $_inp,$inp # restore inp
++ mov $_out,$out # restore out
++ mov $_len,%r10 # restore len
++ mov $s0,0($out)
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ lea 16($inp),$inp
++ lea 16($out),$out
++ sub \$16,%r10
++ test \$-16,%r10
++ jnz .Lcbc_slow_enc_loop
++ test \$15,%r10 # 1..15 bytes left over?
++ jnz .Lcbc_slow_enc_tail
++ mov $_ivp,%rbp # restore ivp
++ mov $s0,0(%rbp) # save ivec
++ mov $s1,4(%rbp)
++ mov $s2,8(%rbp)
++ mov $s3,12(%rbp)
++
++ jmp .Lcbc_exit
++
++.align 4
++.Lcbc_slow_enc_tail:
++ mov %rax,%r11 # preserve rax and rcx across the string ops; restored below
++ mov %rcx,%r12
++ mov %r10,%rcx
++ mov $inp,%rsi
++ mov $out,%rdi
++ .long 0x9066A4F3 # rep movsb
++ mov \$16,%rcx # zero tail
++ sub %r10,%rcx
++ xor %rax,%rax
++ .long 0x9066AAF3 # rep stosb
++ mov $out,$inp # this is not a mistake!
++ mov \$16,%r10 # len=16
++ mov %r11,%rax
++ mov %r12,%rcx
++ jmp .Lcbc_slow_enc_loop # one more spin...
++#--------------------------- SLOW DECRYPT ---------------------------#
++.align 16
++.LSLOW_DECRYPT:
++ shr \$3,%rax
++ add %rax,$sbox # recall "magic" constants!
++
++ mov 0(%rbp),%r11 # copy iv to stack
++ mov 8(%rbp),%r12
++ mov %r11,0+$ivec
++ mov %r12,8+$ivec
++
++.align 4
++.Lcbc_slow_dec_loop:
++ mov 0($inp),$s0 # load input
++ mov 4($inp),$s1
++ mov 8($inp),$s2
++ mov 12($inp),$s3
++ mov $keyp,$key # restore key
++ mov $inp,$_inp # save inp
++ mov $out,$_out # save out
++ mov %r10,$_len # save len
++
++ call _x86_64_AES_decrypt_compact
++
++ mov $_inp,$inp # restore inp
++ mov $_out,$out # restore out
++ mov $_len,%r10
++ xor 0+$ivec,$s0
++ xor 4+$ivec,$s1
++ xor 8+$ivec,$s2
++ xor 12+$ivec,$s3
++
++ mov 0($inp),%r11 # load input
++ mov 8($inp),%r12
++ sub \$16,%r10
++ jc .Lcbc_slow_dec_partial # borrow: fewer than 16 bytes were left
++ jz .Lcbc_slow_dec_done
++
++ mov %r11,0+$ivec # copy input to iv
++ mov %r12,8+$ivec
++
++ mov $s0,0($out) # save output [can zap input]
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ lea 16($inp),$inp
++ lea 16($out),$out
++ jmp .Lcbc_slow_dec_loop
++.Lcbc_slow_dec_done:
++ mov $_ivp,%rdi
++ mov %r11,0(%rdi) # copy iv back to user
++ mov %r12,8(%rdi)
++
++ mov $s0,0($out) # save output [can zap input]
++ mov $s1,4($out)
++ mov $s2,8($out)
++ mov $s3,12($out)
++
++ jmp .Lcbc_exit
++
++.align 4
++.Lcbc_slow_dec_partial:
++ mov $_ivp,%rdi
++ mov %r11,0(%rdi) # copy iv back to user
++ mov %r12,8(%rdi)
++
++ mov $s0,0+$ivec # save output to stack
++ mov $s1,4+$ivec
++ mov $s2,8+$ivec
++ mov $s3,12+$ivec
++
++ mov $out,%rdi
++ lea $ivec,%rsi
++ lea 16(%r10),%rcx # r10 is negative here, so rcx = number of bytes that were left
++ .long 0x9066A4F3 # rep movsb
++ jmp .Lcbc_exit
++
++.align 16
++.Lcbc_exit:
++ mov $_rsp,%rsi # stack pointer value saved after the prologue pushes
++.cfi_def_cfa %rsi,64
++ mov (%rsi),%r15
++.cfi_restore %r15
++ mov 8(%rsi),%r14
++.cfi_restore %r14
++ mov 16(%rsi),%r13
++.cfi_restore %r13
++ mov 24(%rsi),%r12
++.cfi_restore %r12
++ mov 32(%rsi),%rbp
++.cfi_restore %rbp
++ mov 40(%rsi),%rbx
++.cfi_restore %rbx
++ lea 48(%rsi),%rsp # step over the six saved registers; the pushfq image is next
++.cfi_def_cfa %rsp,16
++.Lcbc_popfq:
++ popfq
++# This could be .cfi_pop 49, but libunwind fails on registers it does not
++# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
++.cfi_adjust_cfa_offset -8
++.Lcbc_epilogue:
++ ret
++.cfi_endproc
++.size AES_cbc_encrypt,.-AES_cbc_encrypt
++___
++}
++
++$code.=<<___;
++.align 64
++.LAES_Te:
++___
++ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
++ &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
++ &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
++ &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
++ &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
++ &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
++ &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
++ &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
++ &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
++ &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
++ &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
++ &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
++ &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
++ &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
++ &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
++ &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
++ &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
++ &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
++ &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
++ &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
++ &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
++ &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
++ &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
++ &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
++ &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
++ &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
++ &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
++ &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
++ &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
++ &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
++ &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
++ &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
++ &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
++ &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
++ &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
++ &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
++ &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
++ &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
++ &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
++ &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
++ &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
++ &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
++ &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
++ &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
++ &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
++ &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
++ &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
++ &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
++ &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
++ &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
++ &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
++ &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
++ &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
++ &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
++ &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
++ &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
++ &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
++ &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
++ &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
++ &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
++ &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
++ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
++ &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
++ &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
++
++#Te4: # four copies of Te4 (byte-wide S-box) to choose from to avoid L1 aliasing
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++
++ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
++ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
++ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
++ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
++ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
++ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
++ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
++ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
++ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
++ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
++ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
++ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
++ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
++ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
++ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
++ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
++ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
++ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
++ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
++ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
++ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
++ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
++ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
++ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
++ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
++ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
++ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
++ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
++ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
++ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
++ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
++ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
++#rcon: # AES round constants, followed by 0x80/0xfe/0x1b byte-mask words
++$code.=<<___;
++ .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
++ .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
++ .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
++ .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
++___
++$code.=<<___;
++.align 64
++.LAES_Td:
++___
++ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
++ &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
++ &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
++ &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
++ &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
++ &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
++ &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
++ &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
++ &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
++ &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
++ &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
++ &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
++ &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
++ &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
++ &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
++ &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
++ &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
++ &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
++ &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
++ &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
++ &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
++ &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
++ &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
++ &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
++ &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
++ &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
++ &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
++ &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
++ &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
++ &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
++ &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
++ &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
++ &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
++ &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
++ &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
++ &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
++ &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
++ &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
++ &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
++ &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
++ &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
++ &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
++ &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
++ &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
++ &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
++ &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
++ &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
++ &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
++ &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
++ &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
++ &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
++ &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
++ &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
++ &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
++ &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
++ &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
++ &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
++ &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
++ &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
++ &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
++ &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
++ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
++ &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
++ &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
++
++#Td4: # four copies of Td4 (byte-wide inverse S-box) to choose from to avoid L1 aliasing
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++$code.=<<___;
++ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
++ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
++___
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++$code.=<<___;
++ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
++ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
++___
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++$code.=<<___;
++ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
++ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
++___
++ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
++ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
++ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
++ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
++ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
++ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
++ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
++ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
++ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
++ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
++ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
++ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
++ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
++ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
++ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
++ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
++ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
++ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
++ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
++ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
++ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
++ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
++ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
++ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
++ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
++ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
++ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
++ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
++ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
++ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
++ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
++ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
++$code.=<<___;
++ .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
++ .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
++.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
++.align 64
++___
++
++# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
++# CONTEXT *context,DISPATCHER_CONTEXT *disp)
++if ($win64) {
++$rec="%rcx";  # arg1: EXCEPTION_RECORD *rec
++$frame="%rdx";  # arg2: ULONG64 frame
++$context="%r8";  # arg3: CONTEXT *context
++$disp="%r9";  # arg4: DISPATCHER_CONTEXT *disp
++
++$code.=<<___;
++.extern __imp_RtlVirtualUnwind
++.type block_se_handler,\@abi-omnipotent
++.align 16
++block_se_handler:
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp
++
++ mov 120($context),%rax # pull context->Rax
++ mov 248($context),%rbx # pull context->Rip
++
++ mov 8($disp),%rsi # disp->ImageBase
++ mov 56($disp),%r11 # disp->HandlerData
++
++ mov 0(%r11),%r10d # HandlerData[0]
++ lea (%rsi,%r10),%r10 # prologue label
++ cmp %r10,%rbx # context->Rip<prologue label
++ jb .Lin_block_prologue
++
++ mov 152($context),%rax # pull context->Rsp
++
++ mov 4(%r11),%r10d # HandlerData[1]
++ lea (%rsi,%r10),%r10 # epilogue label
++ cmp %r10,%rbx # context->Rip>=epilogue label
++ jae .Lin_block_prologue
++
++ mov 24(%rax),%rax # pull saved real stack pointer
++
++ mov -8(%rax),%rbx
++ mov -16(%rax),%rbp
++ mov -24(%rax),%r12
++ mov -32(%rax),%r13
++ mov -40(%rax),%r14
++ mov -48(%rax),%r15
++ mov %rbx,144($context) # restore context->Rbx
++ mov %rbp,160($context) # restore context->Rbp
++ mov %r12,216($context) # restore context->R12
++ mov %r13,224($context) # restore context->R13
++ mov %r14,232($context) # restore context->R14
++ mov %r15,240($context) # restore context->R15
++
++.Lin_block_prologue:
++ mov 8(%rax),%rdi
++ mov 16(%rax),%rsi
++ mov %rax,152($context) # restore context->Rsp
++ mov %rsi,168($context) # restore context->Rsi
++ mov %rdi,176($context) # restore context->Rdi
++
++ jmp .Lcommon_seh_exit
++.size block_se_handler,.-block_se_handler
++
++.type key_se_handler,\@abi-omnipotent
++.align 16
++key_se_handler:
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp
++
++ mov 120($context),%rax # pull context->Rax
++ mov 248($context),%rbx # pull context->Rip
++
++ mov 8($disp),%rsi # disp->ImageBase
++ mov 56($disp),%r11 # disp->HandlerData
++
++ mov 0(%r11),%r10d # HandlerData[0]
++ lea (%rsi,%r10),%r10 # prologue label
++ cmp %r10,%rbx # context->Rip<prologue label
++ jb .Lin_key_prologue
++
++ mov 152($context),%rax # pull context->Rsp
++
++ mov 4(%r11),%r10d # HandlerData[1]
++ lea (%rsi,%r10),%r10 # epilogue label
++ cmp %r10,%rbx # context->Rip>=epilogue label
++ jae .Lin_key_prologue
++
++ lea 56(%rax),%rax
++
++ mov -8(%rax),%rbx
++ mov -16(%rax),%rbp
++ mov -24(%rax),%r12
++ mov -32(%rax),%r13
++ mov -40(%rax),%r14
++ mov -48(%rax),%r15
++ mov %rbx,144($context) # restore context->Rbx
++ mov %rbp,160($context) # restore context->Rbp
++ mov %r12,216($context) # restore context->R12
++ mov %r13,224($context) # restore context->R13
++ mov %r14,232($context) # restore context->R14
++ mov %r15,240($context) # restore context->R15
++
++.Lin_key_prologue:
++ mov 8(%rax),%rdi
++ mov 16(%rax),%rsi
++ mov %rax,152($context) # restore context->Rsp
++ mov %rsi,168($context) # restore context->Rsi
++ mov %rdi,176($context) # restore context->Rdi
++
++ jmp .Lcommon_seh_exit
++.size key_se_handler,.-key_se_handler
++
++.type cbc_se_handler,\@abi-omnipotent
++.align 16
++cbc_se_handler:
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp
++
++ mov 120($context),%rax # pull context->Rax
++ mov 248($context),%rbx # pull context->Rip
++
++ lea .Lcbc_prologue(%rip),%r10
++ cmp %r10,%rbx # context->Rip<.Lcbc_prologue
++ jb .Lin_cbc_prologue
++
++ lea .Lcbc_fast_body(%rip),%r10
++ cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
++ jb .Lin_cbc_frame_setup
++
++ lea .Lcbc_slow_prologue(%rip),%r10
++ cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
++ jb .Lin_cbc_body
++
++ lea .Lcbc_slow_body(%rip),%r10
++ cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
++ jb .Lin_cbc_frame_setup
++
++.Lin_cbc_body:
++ mov 152($context),%rax # pull context->Rsp
++
++ lea .Lcbc_epilogue(%rip),%r10
++ cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
++ jae .Lin_cbc_prologue
++
++ lea 8(%rax),%rax
++
++ lea .Lcbc_popfq(%rip),%r10
++ cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
++ jae .Lin_cbc_prologue
++
++ mov `16-8`(%rax),%rax # biased $_rsp
++ lea 56(%rax),%rax
++
++.Lin_cbc_frame_setup:
++ mov -16(%rax),%rbx
++ mov -24(%rax),%rbp
++ mov -32(%rax),%r12
++ mov -40(%rax),%r13
++ mov -48(%rax),%r14
++ mov -56(%rax),%r15
++ mov %rbx,144($context) # restore context->Rbx
++ mov %rbp,160($context) # restore context->Rbp
++ mov %r12,216($context) # restore context->R12
++ mov %r13,224($context) # restore context->R13
++ mov %r14,232($context) # restore context->R14
++ mov %r15,240($context) # restore context->R15
++
++.Lin_cbc_prologue:
++ mov 8(%rax),%rdi
++ mov 16(%rax),%rsi
++ mov %rax,152($context) # restore context->Rsp
++ mov %rsi,168($context) # restore context->Rsi
++ mov %rdi,176($context) # restore context->Rdi
++
++.Lcommon_seh_exit:
++
++ mov 40($disp),%rdi # disp->ContextRecord
++ mov $context,%rsi # context
++ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
++ .long 0xa548f3fc # cld; rep movsq
++
++ mov $disp,%rsi
++ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
++ mov 8(%rsi),%rdx # arg2, disp->ImageBase
++ mov 0(%rsi),%r8 # arg3, disp->ControlPc
++ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
++ mov 40(%rsi),%r10 # disp->ContextRecord
++ lea 56(%rsi),%r11 # &disp->HandlerData
++ lea 24(%rsi),%r12 # &disp->EstablisherFrame
++ mov %r10,32(%rsp) # arg5
++ mov %r11,40(%rsp) # arg6
++ mov %r12,48(%rsp) # arg7
++ mov %rcx,56(%rsp) # arg8, (NULL)
++ call *__imp_RtlVirtualUnwind(%rip)
++
++ mov \$1,%eax # ExceptionContinueSearch
++ add \$64,%rsp
++ popfq
++ pop %r15
++ pop %r14
++ pop %r13
++ pop %r12
++ pop %rbp
++ pop %rbx
++ pop %rdi
++ pop %rsi
++ ret
++.size cbc_se_handler,.-cbc_se_handler
++
++.section .pdata
++.align 4
++ .rva .LSEH_begin_AES_encrypt
++ .rva .LSEH_end_AES_encrypt
++ .rva .LSEH_info_AES_encrypt
++
++ .rva .LSEH_begin_AES_decrypt
++ .rva .LSEH_end_AES_decrypt
++ .rva .LSEH_info_AES_decrypt
++
++ .rva .LSEH_begin_AES_set_encrypt_key
++ .rva .LSEH_end_AES_set_encrypt_key
++ .rva .LSEH_info_AES_set_encrypt_key
++
++ .rva .LSEH_begin_AES_set_decrypt_key
++ .rva .LSEH_end_AES_set_decrypt_key
++ .rva .LSEH_info_AES_set_decrypt_key
++
++ .rva .LSEH_begin_AES_cbc_encrypt
++ .rva .LSEH_end_AES_cbc_encrypt
++ .rva .LSEH_info_AES_cbc_encrypt
++
++.section .xdata
++.align 8
++.LSEH_info_AES_encrypt:
++ .byte 9,0,0,0
++ .rva block_se_handler
++ .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
++.LSEH_info_AES_decrypt:
++ .byte 9,0,0,0
++ .rva block_se_handler
++ .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
++.LSEH_info_AES_set_encrypt_key:
++ .byte 9,0,0,0
++ .rva key_se_handler
++ .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
++.LSEH_info_AES_set_decrypt_key:
++ .byte 9,0,0,0
++ .rva key_se_handler
++ .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
++.LSEH_info_AES_cbc_encrypt:
++ .byte 9,0,0,0
++ .rva cbc_se_handler
++___
++}
++
++$code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backquoted span in $code with the value of evaluating it
++
++print $code;
++
++close STDOUT or die "error closing STDOUT: $!"; # buffered write errors surface at close; fail instead of leaving truncated output
+diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
+new file mode 100644
+index 0000000000..e62342729e
+--- /dev/null
++++ b/crypto/aes/asm/bsaes-x86_64.pl
+@@ -0,0 +1,3239 @@
++#! /usr/bin/env perl
++# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the OpenSSL license (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++
++###################################################################
++### AES-128 [originally in CTR mode] ###
++### bitsliced implementation for Intel Core 2 processors ###
++### requires support of SSE extensions up to SSSE3 ###
++### Author: Emilia Käsper and Peter Schwabe ###
++### Date: 2009-03-19 ###
++### Public domain ###
++### ###
++### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
++### further information. ###
++###################################################################
++#
++# September 2011.
++#
++# Started as a transliteration to "perlasm", the original code has
++# undergone the following changes:
++#
++# - code was made position-independent;
++# - rounds were folded into a loop resulting in >5x size reduction
++# from 12.5KB to 2.2KB;
++# - above was possible thanks to mixcolumns() modification that
++# allowed to feed its output back to aesenc[last], this was
++# achieved at cost of two additional inter-registers moves;
++# - some instruction reordering and interleaving;
++# - this module doesn't implement key setup subroutine, instead it
++# relies on conversion of "conventional" key schedule as returned
++# by AES_set_encrypt_key (see discussion below);
++# - first and last round keys are treated differently, which allowed
++# to skip one shiftrows(), reduce bit-sliced key schedule and
++# speed-up conversion by 22%;
++# - support for 192- and 256-bit keys was added;
++#
++# Resulting performance in CPU cycles spent to encrypt one byte out
++# of 4096-byte buffer with 128-bit key is:
++#
++# Emilia's this(*) difference
++#
++# Core 2 9.30 8.69 +7%
++# Nehalem(**) 7.63 6.88 +11%
++# Atom 17.1 16.4 +4%
++# Silvermont - 12.9
++# Goldmont - 8.85
++#
++# (*) Comparison is not completely fair, because "this" is ECB,
++# i.e. no extra processing such as counter values calculation
++# and xor-ing input as in Emilia's CTR implementation is
++# performed. However, the CTR calculations stand for not more
++# than 1% of total time, so comparison is *rather* fair.
++#
++# (**) Results were collected on Westmere, which is considered to
++# be equivalent to Nehalem for this code.
++#
++# As for key schedule conversion subroutine. Interface to OpenSSL
++# relies on per-invocation on-the-fly conversion. This naturally
++# has impact on performance, especially for short inputs. Conversion
++# time in CPU cycles and its ratio to CPU cycles spent in 8x block
++# function is:
++#
++# conversion conversion/8x block
++# Core 2 240 0.22
++# Nehalem 180 0.20
++# Atom 430 0.20
++#
++# The ratio values mean that 128-byte blocks will be processed
++# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
++# etc. Then keep in mind that input sizes not divisible by 128 are
++# *effectively* slower, especially shortest ones, e.g. consecutive
++# 144-byte blocks are processed 44% slower than one would expect,
++# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
++# it's still faster than ["hyper-threading-safe" code path in]
++# aes-x86_64.pl on all lengths above 64 bytes...
++#
++# October 2011.
++#
++# Add decryption procedure. Performance in CPU cycles spent to decrypt
++# one byte out of 4096-byte buffer with 128-bit key is:
++#
++# Core 2 9.98
++# Nehalem 7.80
++# Atom 17.9
++# Silvermont 14.0
++# Goldmont 10.2
++#
++# November 2011.
++#
++# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
++# suboptimal, but XTS is meant to be used with larger blocks...
++#
++# <appro@openssl.org>
++
++$flavour = shift; # first argument: flavour, or the output file name if it contains a dot (see below)
++$output = shift; # second argument: output file name
++if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
++
++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
++die "can't locate x86_64-xlate.pl";
++
++open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # pipe everything printed through the translator; abort if it cannot be started
++*STDOUT=*OUT;
++
++my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); # four values for five names: $ivp stays undef here
++my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
++my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
++
++{
++my ($key,$rounds,$const)=("%rax","%r10d","%r11");
++
++sub Sbox { # emits InBasisChange, Inv_GF256 and OutBasisChange back to back
++# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
++# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
++my @b=@_[0..7]; # the eight registers being transformed
++my @t=@_[8..11]; # scratch registers forwarded to Inv_GF256
++my @s=@_[12..15]; # scratch registers forwarded to Inv_GF256
++ &InBasisChange (@b);
++ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); # order follows InBasisChange's documented output
++ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
++}
++
++sub InBasisChange { # appends 13 in-place pxor instructions to $code; uses no temporaries
++# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
++# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
++my @b=@_[0..7]; # register names, interpolated into the text below
++$code.=<<___;
++ pxor @b[6], @b[5]
++ pxor @b[1], @b[2]
++ pxor @b[0], @b[3]
++ pxor @b[2], @b[6]
++ pxor @b[0], @b[5]
++
++ pxor @b[3], @b[6]
++ pxor @b[7], @b[3]
++ pxor @b[5], @b[7]
++ pxor @b[4], @b[3]
++ pxor @b[5], @b[4]
++ pxor @b[1], @b[3]
++
++ pxor @b[7], @b[2]
++ pxor @b[5], @b[1]
++___
++}
++
++sub OutBasisChange { # appends 11 in-place pxor instructions to $code; uses no temporaries
++# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
++# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
++my @b=@_[0..7]; # register names, interpolated into the text below
++$code.=<<___;
++ pxor @b[6], @b[0]
++ pxor @b[4], @b[1]
++ pxor @b[0], @b[2]
++ pxor @b[6], @b[4]
++ pxor @b[1], @b[6]
++
++ pxor @b[5], @b[1]
++ pxor @b[3], @b[5]
++ pxor @b[7], @b[3]
++ pxor @b[5], @b[7]
++ pxor @b[5], @b[2]
++
++ pxor @b[7], @b[4]
++___
++}
++
++sub InvSbox { # emits InvInBasisChange, Inv_GF256 and InvOutBasisChange back to back
++# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
++# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
++my @b=@_[0..7]; # the eight registers being transformed
++my @t=@_[8..11]; # scratch registers forwarded to Inv_GF256
++my @s=@_[12..15]; # scratch registers forwarded to Inv_GF256
++ &InvInBasisChange (@b);
++ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); # same permutation InvInBasisChange applies to its arguments
++ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
++}
++
++sub InvInBasisChange { # OutBasisChange in reverse
++my @b=@_[5,1,2,6,3,7,0,4]; # permute the arguments; the pxor pairs below are OutBasisChange's, issued in roughly reverse order
++$code.=<<___; # terminated explicitly so a statement added after the heredoc cannot be absorbed into this expression
++ pxor @b[7], @b[4]
++
++ pxor @b[5], @b[7]
++ pxor @b[5], @b[2]
++ pxor @b[7], @b[3]
++ pxor @b[3], @b[5]
++ pxor @b[5], @b[1]
++
++ pxor @b[1], @b[6]
++ pxor @b[0], @b[2]
++ pxor @b[6], @b[4]
++ pxor @b[6], @b[0]
++ pxor @b[4], @b[1]
++___
++}
++
++sub InvOutBasisChange { # InBasisChange in reverse
++my @b=@_[2,5,7,3,6,1,0,4]; # permute the arguments; the 13 pxor pairs below are InBasisChange's, issued in roughly reverse order
++$code.=<<___;
++ pxor @b[5], @b[1]
++ pxor @b[7], @b[2]
++
++ pxor @b[1], @b[3]
++ pxor @b[5], @b[4]
++ pxor @b[5], @b[7]
++ pxor @b[4], @b[3]
++ pxor @b[0], @b[5]
++ pxor @b[7], @b[3]
++ pxor @b[2], @b[6]
++ pxor @b[1], @b[2]
++ pxor @b[3], @b[6]
++
++ pxor @b[0], @b[3]
++ pxor @b[6], @b[5]
++___
++}
++
++sub Mul_GF4 { # appends 8 instructions to $code
++#;*************************************************************
++#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
++#;*************************************************************
++my ($x0,$x1,$y0,$y1,$t0)=@_; # x0/x1 are overwritten, y0/y1 are only read, t0 is scratch
++$code.=<<___;
++ movdqa $y0, $t0
++ pxor $y1, $t0
++ pand $x0, $t0
++ pxor $x1, $x0
++ pand $y0, $x1
++ pand $y1, $x0
++ pxor $x1, $x0
++ pxor $t0, $x1
++___
++}
++
++sub Mul_GF4_N { # not used, see next subroutine
++# multiply and scale by N
++my ($x0,$x1,$y0,$y1,$t0)=@_; # same roles as in Mul_GF4; only the last two pxor's differ from it
++$code.=<<___;
++ movdqa $y0, $t0
++ pxor $y1, $t0
++ pand $x0, $t0
++ pxor $x1, $x0
++ pand $y0, $x1
++ pand $y1, $x0
++ pxor $x0, $x1
++ pxor $t0, $x0
++___
++}
++
++sub Mul_GF4_N_GF4 { # two independent multiplications, emitted instruction by instruction in alternation
++# interleaved Mul_GF4_N and Mul_GF4
++my ($x0,$x1,$y0,$y1,$t0, # operands of the Mul_GF4_N half
++ $x2,$x3,$y2,$y3,$t1)=@_; # operands of the Mul_GF4 half
++$code.=<<___;
++ movdqa $y0, $t0
++ movdqa $y2, $t1
++ pxor $y1, $t0
++ pxor $y3, $t1
++ pand $x0, $t0
++ pand $x2, $t1
++ pxor $x1, $x0
++ pxor $x3, $x2
++ pand $y0, $x1
++ pand $y2, $x3
++ pand $y1, $x0
++ pand $y3, $x2
++ pxor $x0, $x1
++ pxor $x3, $x2
++ pxor $t0, $x0
++ pxor $t1, $x3
++___
++}
++sub Mul_GF16_2 { # processes x[0..3] and x[4..7] in turn against the same y[0..3]
++my @x=@_[0..7]; # two groups of four registers, both updated in place
++my @y=@_[8..11]; # y[0],y[1] are xored with y[2],y[3] midway and xored back by the second such pair below
++my @t=@_[12..15]; # scratch registers
++$code.=<<___;
++ movdqa @x[0], @t[0]
++ movdqa @x[1], @t[1]
++___
++ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
++$code.=<<___;
++ pxor @x[2], @t[0]
++ pxor @x[3], @t[1]
++ pxor @y[2], @y[0]
++ pxor @y[3], @y[1]
++___
++ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
++ @x[2], @x[3], @y[2], @y[3], @t[2]);
++$code.=<<___;
++ pxor @t[0], @x[0]
++ pxor @t[0], @x[2]
++ pxor @t[1], @x[1]
++ pxor @t[1], @x[3]
++
++ movdqa @x[4], @t[0]
++ movdqa @x[5], @t[1]
++ pxor @x[6], @t[0]
++ pxor @x[7], @t[1]
++___
++ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
++ @x[6], @x[7], @y[2], @y[3], @t[2]);
++$code.=<<___;
++ pxor @y[2], @y[0]
++ pxor @y[3], @y[1]
++___
++ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
++$code.=<<___;
++ pxor @t[0], @x[4]
++ pxor @t[0], @x[6]
++ pxor @t[1], @x[5]
++ pxor @t[1], @x[7]
++___
++}
++sub Inv_GF256 { # emits one straight-line sequence, then finishes with a call to Mul_GF16_2
++#;********************************************************************
++#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
++#;********************************************************************
++my @x=@_[0..7]; # input registers; only read in the heredoc below, updated by Mul_GF16_2
++my @t=@_[8..11]; # temporaries t0-t3
++my @s=@_[12..15]; # temporaries s0-s3
++# direct optimizations from hardware
++$code.=<<___;
++ movdqa @x[4], @t[3]
++ movdqa @x[5], @t[2]
++ movdqa @x[1], @t[1]
++ movdqa @x[7], @s[1]
++ movdqa @x[0], @s[0]
++
++ pxor @x[6], @t[3]
++ pxor @x[7], @t[2]
++ pxor @x[3], @t[1]
++ movdqa @t[3], @s[2]
++ pxor @x[6], @s[1]
++ movdqa @t[2], @t[0]
++ pxor @x[2], @s[0]
++ movdqa @t[3], @s[3]
++
++ por @t[1], @t[2]
++ por @s[0], @t[3]
++ pxor @t[0], @s[3]
++ pand @s[0], @s[2]
++ pxor @t[1], @s[0]
++ pand @t[1], @t[0]
++ pand @s[0], @s[3]
++ movdqa @x[3], @s[0]
++ pxor @x[2], @s[0]
++ pand @s[0], @s[1]
++ pxor @s[1], @t[3]
++ pxor @s[1], @t[2]
++ movdqa @x[4], @s[1]
++ movdqa @x[1], @s[0]
++ pxor @x[5], @s[1]
++ pxor @x[0], @s[0]
++ movdqa @s[1], @t[1]
++ pand @s[0], @s[1]
++ por @s[0], @t[1]
++ pxor @s[1], @t[0]
++ pxor @s[3], @t[3]
++ pxor @s[2], @t[2]
++ pxor @s[3], @t[1]
++ movdqa @x[7], @s[0]
++ pxor @s[2], @t[0]
++ movdqa @x[6], @s[1]
++ pxor @s[2], @t[1]
++ movdqa @x[5], @s[2]
++ pand @x[3], @s[0]
++ movdqa @x[4], @s[3]
++ pand @x[2], @s[1]
++ pand @x[1], @s[2]
++ por @x[0], @s[3]
++ pxor @s[0], @t[3]
++ pxor @s[1], @t[2]
++ pxor @s[2], @t[1]
++ pxor @s[3], @t[0]
++
++ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
++
++ # new smaller inversion
++
++ movdqa @t[3], @s[0]
++ pand @t[1], @t[3]
++ pxor @t[2], @s[0]
++
++ movdqa @t[0], @s[2]
++ movdqa @s[0], @s[3]
++ pxor @t[3], @s[2]
++ pand @s[2], @s[3]
++
++ movdqa @t[1], @s[1]
++ pxor @t[2], @s[3]
++ pxor @t[0], @s[1]
++
++ pxor @t[2], @t[3]
++
++ pand @t[3], @s[1]
++
++ movdqa @s[2], @t[2]
++ pxor @t[0], @s[1]
++
++ pxor @s[1], @t[2]
++ pxor @s[1], @t[1]
++
++ pand @t[0], @t[2]
++
++ pxor @t[2], @s[2]
++ pxor @t[2], @t[1]
++
++ pand @s[3], @s[2]
++
++ pxor @s[0], @s[2]
++___
++# output in s3, s2, s1, t1
++
++# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
++
++# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
++ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); # y=(s3,s2,s1,t1), scratch=(s0,t0,t2,t3)
++
++### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
++}
++
++# AES linear components
++
++sub ShiftRows { # xors 0x80 bytes at $key into x[0..7], applies pshufb by $mask to each, then advances $key by 0x80
++my @x=@_[0..7]; # the eight registers to process
++my $mask=pop; # last argument: register holding the pshufb control mask
++$code.=<<___;
++ pxor 0x00($key),@x[0]
++ pxor 0x10($key),@x[1]
++ pxor 0x20($key),@x[2]
++ pxor 0x30($key),@x[3]
++ pshufb $mask,@x[0]
++ pshufb $mask,@x[1]
++ pxor 0x40($key),@x[4]
++ pxor 0x50($key),@x[5]
++ pshufb $mask,@x[2]
++ pshufb $mask,@x[3]
++ pxor 0x60($key),@x[6]
++ pxor 0x70($key),@x[7]
++ pshufb $mask,@x[4]
++ pshufb $mask,@x[5]
++ pshufb $mask,@x[6]
++ pshufb $mask,@x[7]
++ lea 0x80($key),$key
++___
++}
++
++sub MixColumns { # common body followed by one of two tails, chosen by the optional 17th argument
++# modified to emit output in order suitable for feeding back to aesenc[last]
++my @x=@_[0..7]; # registers transformed in place
++my @t=@_[8..15]; # eight scratch registers
++my $inv=@_[16]; # optional
++$code.=<<___;
++ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
++ pshufd \$0x93, @x[1], @t[1]
++ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
++ pshufd \$0x93, @x[2], @t[2]
++ pxor @t[1], @x[1]
++ pshufd \$0x93, @x[3], @t[3]
++ pxor @t[2], @x[2]
++ pshufd \$0x93, @x[4], @t[4]
++ pxor @t[3], @x[3]
++ pshufd \$0x93, @x[5], @t[5]
++ pxor @t[4], @x[4]
++ pshufd \$0x93, @x[6], @t[6]
++ pxor @t[5], @x[5]
++ pshufd \$0x93, @x[7], @t[7]
++ pxor @t[6], @x[6]
++ pxor @t[7], @x[7]
++
++ pxor @x[0], @t[1]
++ pxor @x[7], @t[0]
++ pxor @x[7], @t[1]
++ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
++ pxor @x[1], @t[2]
++ pshufd \$0x4E, @x[1], @x[1]
++ pxor @x[4], @t[5]
++ pxor @t[0], @x[0]
++ pxor @x[5], @t[6]
++ pxor @t[1], @x[1]
++ pxor @x[3], @t[4]
++ pshufd \$0x4E, @x[4], @t[0]
++ pxor @x[6], @t[7]
++ pshufd \$0x4E, @x[5], @t[1]
++ pxor @x[2], @t[3]
++ pshufd \$0x4E, @x[3], @x[4]
++ pxor @x[7], @t[3]
++ pshufd \$0x4E, @x[7], @x[5]
++ pxor @x[7], @t[4]
++ pshufd \$0x4E, @x[6], @x[3]
++ pxor @t[4], @t[0]
++ pshufd \$0x4E, @x[2], @x[6]
++ pxor @t[5], @t[1]
++___
++$code.=<<___ if (!$inv); # tail used when called without the 17th argument
++ pxor @t[3], @x[4]
++ pxor @t[7], @x[5]
++ pxor @t[6], @x[3]
++ movdqa @t[0], @x[2]
++ pxor @t[2], @x[6]
++ movdqa @t[1], @x[7]
++___
++$code.=<<___ if ($inv); # tail used by InvMixColumns, which passes 1
++ pxor @x[4], @t[3]
++ pxor @t[7], @x[5]
++ pxor @x[3], @t[6]
++ movdqa @t[0], @x[3]
++ pxor @t[2], @x[6]
++ movdqa @t[6], @x[2]
++ movdqa @t[1], @x[7]
++ movdqa @x[6], @x[4]
++ movdqa @t[3], @x[6]
++___
++}
++
++sub InvMixColumns_orig { # NOTE(review): no caller in the visible part of this file; the decrypt path calls InvMixColumns
++my @x=@_[0..7]; # registers holding the input
++my @t=@_[8..15]; # eight scratch registers; the result is copied from them into @XMM[0..7] at the end
++
++$code.=<<___;
++ # multiplication by 0x0e
++ pshufd \$0x93, @x[7], @t[7]
++ movdqa @x[2], @t[2]
++ pxor @x[5], @x[7] # 7 5
++ pxor @x[5], @x[2] # 2 5
++ pshufd \$0x93, @x[0], @t[0]
++ movdqa @x[5], @t[5]
++ pxor @x[0], @x[5] # 5 0 [1]
++ pxor @x[1], @x[0] # 0 1
++ pshufd \$0x93, @x[1], @t[1]
++ pxor @x[2], @x[1] # 1 25
++ pxor @x[6], @x[0] # 01 6 [2]
++ pxor @x[3], @x[1] # 125 3 [4]
++ pshufd \$0x93, @x[3], @t[3]
++ pxor @x[0], @x[2] # 25 016 [3]
++ pxor @x[7], @x[3] # 3 75
++ pxor @x[6], @x[7] # 75 6 [0]
++ pshufd \$0x93, @x[6], @t[6]
++ movdqa @x[4], @t[4]
++ pxor @x[4], @x[6] # 6 4
++ pxor @x[3], @x[4] # 4 375 [6]
++ pxor @x[7], @x[3] # 375 756=36
++ pxor @t[5], @x[6] # 64 5 [7]
++ pxor @t[2], @x[3] # 36 2
++ pxor @t[4], @x[3] # 362 4 [5]
++ pshufd \$0x93, @t[5], @t[5]
++___
++ my @y = @x[7,5,0,2,1,3,4,6]; # y[i] is the register tagged [i] in the comments of the 0x0e step above
++$code.=<<___;
++ # multiplication by 0x0b
++ pxor @y[0], @y[1]
++ pxor @t[0], @y[0]
++ pxor @t[1], @y[1]
++ pshufd \$0x93, @t[2], @t[2]
++ pxor @t[5], @y[0]
++ pxor @t[6], @y[1]
++ pxor @t[7], @y[0]
++ pshufd \$0x93, @t[4], @t[4]
++ pxor @t[6], @t[7] # clobber t[7]
++ pxor @y[0], @y[1]
++
++ pxor @t[0], @y[3]
++ pshufd \$0x93, @t[0], @t[0]
++ pxor @t[1], @y[2]
++ pxor @t[1], @y[4]
++ pxor @t[2], @y[2]
++ pshufd \$0x93, @t[1], @t[1]
++ pxor @t[2], @y[3]
++ pxor @t[2], @y[5]
++ pxor @t[7], @y[2]
++ pshufd \$0x93, @t[2], @t[2]
++ pxor @t[3], @y[3]
++ pxor @t[3], @y[6]
++ pxor @t[3], @y[4]
++ pshufd \$0x93, @t[3], @t[3]
++ pxor @t[4], @y[7]
++ pxor @t[4], @y[5]
++ pxor @t[7], @y[7]
++ pxor @t[5], @y[3]
++ pxor @t[4], @y[4]
++ pxor @t[5], @t[7] # clobber t[7] even more
++
++ pxor @t[7], @y[5]
++ pshufd \$0x93, @t[4], @t[4]
++ pxor @t[7], @y[6]
++ pxor @t[7], @y[4]
++
++ pxor @t[5], @t[7]
++ pshufd \$0x93, @t[5], @t[5]
++ pxor @t[6], @t[7] # restore t[7]
++
++ # multiplication by 0x0d
++ pxor @y[7], @y[4]
++ pxor @t[4], @y[7]
++ pshufd \$0x93, @t[6], @t[6]
++ pxor @t[0], @y[2]
++ pxor @t[5], @y[7]
++ pxor @t[2], @y[2]
++ pshufd \$0x93, @t[7], @t[7]
++
++ pxor @y[1], @y[3]
++ pxor @t[1], @y[1]
++ pxor @t[0], @y[0]
++ pxor @t[0], @y[3]
++ pxor @t[5], @y[1]
++ pxor @t[5], @y[0]
++ pxor @t[7], @y[1]
++ pshufd \$0x93, @t[0], @t[0]
++ pxor @t[6], @y[0]
++ pxor @y[1], @y[3]
++ pxor @t[1], @y[4]
++ pshufd \$0x93, @t[1], @t[1]
++
++ pxor @t[7], @y[7]
++ pxor @t[2], @y[4]
++ pxor @t[2], @y[5]
++ pshufd \$0x93, @t[2], @t[2]
++ pxor @t[6], @y[2]
++ pxor @t[3], @t[6] # clobber t[6]
++ pxor @y[7], @y[4]
++ pxor @t[6], @y[3]
++
++ pxor @t[6], @y[6]
++ pxor @t[5], @y[5]
++ pxor @t[4], @y[6]
++ pshufd \$0x93, @t[4], @t[4]
++ pxor @t[6], @y[5]
++ pxor @t[7], @y[6]
++ pxor @t[3], @t[6] # restore t[6]
++
++ pshufd \$0x93, @t[5], @t[5]
++ pshufd \$0x93, @t[6], @t[6]
++ pshufd \$0x93, @t[7], @t[7]
++ pshufd \$0x93, @t[3], @t[3]
++
++ # multiplication by 0x09
++ pxor @y[1], @y[4]
++ pxor @y[1], @t[1] # t[1]=y[1]
++ pxor @t[5], @t[0] # clobber t[0]
++ pxor @t[5], @t[1]
++ pxor @t[0], @y[3]
++ pxor @y[0], @t[0] # t[0]=y[0]
++ pxor @t[6], @t[1]
++ pxor @t[7], @t[6] # clobber t[6]
++ pxor @t[1], @y[4]
++ pxor @t[4], @y[7]
++ pxor @y[4], @t[4] # t[4]=y[4]
++ pxor @t[3], @y[6]
++ pxor @y[3], @t[3] # t[3]=y[3]
++ pxor @t[2], @y[5]
++ pxor @y[2], @t[2] # t[2]=y[2]
++ pxor @t[7], @t[3]
++ pxor @y[5], @t[5] # t[5]=y[5]
++ pxor @t[6], @t[2]
++ pxor @t[6], @t[5]
++ pxor @y[6], @t[6] # t[6]=y[6]
++ pxor @y[7], @t[7] # t[7]=y[7]
++
++ movdqa @t[0],@XMM[0]
++ movdqa @t[1],@XMM[1]
++ movdqa @t[2],@XMM[2]
++ movdqa @t[3],@XMM[3]
++ movdqa @t[4],@XMM[4]
++ movdqa @t[5],@XMM[5]
++ movdqa @t[6],@XMM[6]
++ movdqa @t[7],@XMM[7]
++___
++}
++
++sub InvMixColumns { # emits the 05-00-04-00 pre-multiplication, then reuses MixColumns with its $inv tail
++my @x=@_[0..7]; # registers transformed in place
++my @t=@_[8..15]; # eight scratch registers, also passed on to MixColumns
++
++# Thanks to Jussi Kivilinna for providing pointer to
++#
++# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
++# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
++# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
++# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
++
++$code.=<<___;
++ # multiplication by 0x05-0x00-0x04-0x00
++ pshufd \$0x4E, @x[0], @t[0]
++ pshufd \$0x4E, @x[6], @t[6]
++ pxor @x[0], @t[0]
++ pshufd \$0x4E, @x[7], @t[7]
++ pxor @x[6], @t[6]
++ pshufd \$0x4E, @x[1], @t[1]
++ pxor @x[7], @t[7]
++ pshufd \$0x4E, @x[2], @t[2]
++ pxor @x[1], @t[1]
++ pshufd \$0x4E, @x[3], @t[3]
++ pxor @x[2], @t[2]
++ pxor @t[6], @x[0]
++ pxor @t[6], @x[1]
++ pshufd \$0x4E, @x[4], @t[4]
++ pxor @x[3], @t[3]
++ pxor @t[0], @x[2]
++ pxor @t[1], @x[3]
++ pshufd \$0x4E, @x[5], @t[5]
++ pxor @x[4], @t[4]
++ pxor @t[7], @x[1]
++ pxor @t[2], @x[4]
++ pxor @x[5], @t[5]
++
++ pxor @t[7], @x[2]
++ pxor @t[6], @x[3]
++ pxor @t[6], @x[4]
++ pxor @t[3], @x[5]
++ pxor @t[4], @x[6]
++ pxor @t[7], @x[4]
++ pxor @t[7], @x[5]
++ pxor @t[5], @x[7]
++___
++ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
++}
++
++sub aesenc { # not used
++my @b=@_[0..7]; # the eight registers being processed
++my @t=@_[8..15]; # scratch; t[0] also carries the pshufb mask for ShiftRows
++$code.=<<___;
++ movdqa 0x30($const),@t[0] # .LSR
++___
++ &ShiftRows (@b,@t[0]);
++ &Sbox (@b,@t);
++ &MixColumns (@b[0,1,4,6,3,7,2,5],@t); # registers passed in Sbox's documented output order
++}
++
++sub aesenclast { # not used
++my @b=@_[0..7]; # the eight registers being processed
++my @t=@_[8..15]; # scratch; t[0] also carries the pshufb mask for ShiftRows
++$code.=<<___;
++ movdqa 0x40($const),@t[0] # .LSRM0
++___
++ &ShiftRows (@b,@t[0]);
++ &Sbox (@b,@t);
++$code.=<<___; # terminated explicitly so a statement added after the heredoc cannot be absorbed into this expression
++ pxor 0x00($key),@b[0]
++ pxor 0x10($key),@b[1]
++ pxor 0x20($key),@b[4]
++ pxor 0x30($key),@b[6]
++ pxor 0x40($key),@b[3]
++ pxor 0x50($key),@b[7]
++ pxor 0x60($key),@b[2]
++ pxor 0x70($key),@b[5]
++___
++}
++
++sub swapmove { # exchanges the $mask-selected bits of $a with the bits $n positions higher in $b
++my ($a,$b,$n,$mask,$t)=@_; # $t is scratch (keeps the original $b); any further arguments are ignored
++$code.=<<___;
++ movdqa $b,$t
++ psrlq \$$n,$b
++ pxor $a,$b
++ pand $mask,$b
++ pxor $b,$a
++ psllq \$$n,$b
++ pxor $t,$b
++___
++}
++sub swapmove2x { # two swapmove sequences, interleaved, sharing $n and $mask
++my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; # ($a0,$b0,$t0) and ($a1,$b1,$t1) are the two independent register sets
++$code.=<<___;
++ movdqa $b0,$t0
++ psrlq \$$n,$b0
++ movdqa $b1,$t1
++ psrlq \$$n,$b1
++ pxor $a0,$b0
++ pxor $a1,$b1
++ pand $mask,$b0
++ pand $mask,$b1
++ pxor $b0,$a0
++ psllq \$$n,$b0
++ pxor $b1,$a1
++ psllq \$$n,$b1
++ pxor $t0,$b0
++ pxor $t1,$b1
++___
++}
++
++sub bitslice { # six swapmove2x passes with shifts 1, 2 and 4 and masks loaded from $const
++my @x=reverse(@_[0..7]); # the eight data registers, taken in reverse argument order
++my ($t0,$t1,$t2,$t3)=@_[8..11]; # $t0/$t1 hold the masks, $t2/$t3 are swapmove2x scratch
++$code.=<<___;
++ movdqa 0x00($const),$t0 # .LBS0
++ movdqa 0x10($const),$t1 # .LBS1
++___
++ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
++ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
++$code.=<<___;
++ movdqa 0x20($const),$t0 # .LBS2
++___
++ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
++ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
++
++ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); # $t0 was reloaded with .LBS2 above
++ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
++}
++
++$code.=<<___;
++.text
++
++.extern asm_AES_encrypt
++.extern asm_AES_decrypt
++
++.type _bsaes_encrypt8,\@abi-omnipotent
++.align 64
++_bsaes_encrypt8:
++.cfi_startproc
++ lea .LBS0(%rip), $const # constants table
++
++ movdqa ($key), @XMM[9] # round 0 key
++ lea 0x10($key), $key
++ movdqa 0x50($const), @XMM[8] # .LM0SR
++ pxor @XMM[9], @XMM[0] # xor with round0 key
++ pxor @XMM[9], @XMM[1]
++ pxor @XMM[9], @XMM[2]
++ pxor @XMM[9], @XMM[3]
++ pshufb @XMM[8], @XMM[0]
++ pshufb @XMM[8], @XMM[1]
++ pxor @XMM[9], @XMM[4]
++ pxor @XMM[9], @XMM[5]
++ pshufb @XMM[8], @XMM[2]
++ pshufb @XMM[8], @XMM[3]
++ pxor @XMM[9], @XMM[6]
++ pxor @XMM[9], @XMM[7]
++ pshufb @XMM[8], @XMM[4]
++ pshufb @XMM[8], @XMM[5]
++ pshufb @XMM[8], @XMM[6]
++ pshufb @XMM[8], @XMM[7]
++_bsaes_encrypt8_bitslice:
++___
++ &bitslice (@XMM[0..7, 8..11]); # @XMM[8..11] serve as mask and scratch registers
++$code.=<<___;
++ dec $rounds
++ jmp .Lenc_sbox
++.align 16
++.Lenc_loop:
++___
++ &ShiftRows (@XMM[0..7, 8]); # @XMM[8] holds the pshufb mask loaded at the bottom of the loop
++$code.=".Lenc_sbox:\n"; # the first pass jumps here, skipping ShiftRows
++ &Sbox (@XMM[0..7, 8..15]);
++$code.=<<___;
++ dec $rounds
++ jl .Lenc_done
++___
++ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); # state passed in Sbox's documented output order
++$code.=<<___;
++ movdqa 0x30($const), @XMM[8] # .LSR
++ jnz .Lenc_loop
++ movdqa 0x40($const), @XMM[8] # .LSRM0
++ jmp .Lenc_loop
++.align 16
++.Lenc_done:
++___
++ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
++ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
++$code.=<<___;
++ movdqa ($key), @XMM[8] # last round key
++ pxor @XMM[8], @XMM[4]
++ pxor @XMM[8], @XMM[6]
++ pxor @XMM[8], @XMM[3]
++ pxor @XMM[8], @XMM[7]
++ pxor @XMM[8], @XMM[2]
++ pxor @XMM[8], @XMM[5]
++ pxor @XMM[8], @XMM[0]
++ pxor @XMM[8], @XMM[1]
++ ret
++.cfi_endproc
++.size _bsaes_encrypt8,.-_bsaes_encrypt8
++
++.type _bsaes_decrypt8,\@abi-omnipotent
++.align 64
++_bsaes_decrypt8:
++.cfi_startproc
++ lea .LBS0(%rip), $const # constants table
++
++ movdqa ($key), @XMM[9] # round 0 key
++ lea 0x10($key), $key
++ movdqa -0x30($const), @XMM[8] # .LM0ISR
++ pxor @XMM[9], @XMM[0] # xor with round0 key
++ pxor @XMM[9], @XMM[1]
++ pxor @XMM[9], @XMM[2]
++ pxor @XMM[9], @XMM[3]
++ pshufb @XMM[8], @XMM[0]
++ pshufb @XMM[8], @XMM[1]
++ pxor @XMM[9], @XMM[4]
++ pxor @XMM[9], @XMM[5]
++ pshufb @XMM[8], @XMM[2]
++ pshufb @XMM[8], @XMM[3]
++ pxor @XMM[9], @XMM[6]
++ pxor @XMM[9], @XMM[7]
++ pshufb @XMM[8], @XMM[4]
++ pshufb @XMM[8], @XMM[5]
++ pshufb @XMM[8], @XMM[6]
++ pshufb @XMM[8], @XMM[7]
++___
++ &bitslice (@XMM[0..7, 8..11]); # @XMM[8..11] serve as mask and scratch registers
++$code.=<<___;
++ dec $rounds
++ jmp .Ldec_sbox
++.align 16
++.Ldec_loop:
++___
++ &ShiftRows (@XMM[0..7, 8]); # @XMM[8] holds the pshufb mask loaded at the bottom of the loop
++$code.=".Ldec_sbox:\n"; # the first pass jumps here, skipping ShiftRows
++ &InvSbox (@XMM[0..7, 8..15]);
++$code.=<<___;
++ dec $rounds
++ jl .Ldec_done
++___
++ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); # state passed in InvSbox's documented output order
++$code.=<<___;
++ movdqa -0x10($const), @XMM[8] # .LISR
++ jnz .Ldec_loop
++ movdqa -0x20($const), @XMM[8] # .LISRM0
++ jmp .Ldec_loop
++.align 16
++.Ldec_done:
++___
++ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); # same register order as InvSbox's documented output
++$code.=<<___;
++ movdqa ($key), @XMM[8] # last round key
++ pxor @XMM[8], @XMM[6]
++ pxor @XMM[8], @XMM[4]
++ pxor @XMM[8], @XMM[2]
++ pxor @XMM[8], @XMM[7]
++ pxor @XMM[8], @XMM[3]
++ pxor @XMM[8], @XMM[5]
++ pxor @XMM[8], @XMM[0]
++ pxor @XMM[8], @XMM[1]
++ ret
++.cfi_endproc
++.size _bsaes_decrypt8,.-_bsaes_decrypt8
++___
++}
++{
++my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
++
++sub bitslice_key { # NOTE(review): no caller in the visible part of this file
++my @x=reverse(@_[0..7]); # the eight registers, taken in reverse argument order
++my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; # three mask registers followed by two scratch registers
++
++ &swapmove (@x[0,1],1,$bs0,$t2,$t3); # swapmove reads five arguments, so the trailing $t3 is ignored
++$code.=<<___;
++ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
++ movdqa @x[0], @x[2]
++ movdqa @x[1], @x[3]
++___
++ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
++
++ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
++$code.=<<___;
++ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
++ movdqa @x[0], @x[4]
++ movdqa @x[2], @x[6]
++ movdqa @x[1], @x[5]
++ movdqa @x[3], @x[7]
++___
++ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
++ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
++}
++
++$code.=<<___; # $out, $inp, $rounds and $const below are this block's lexicals: %rax, %rcx, %r10d, %r11
++.type _bsaes_key_convert,\@abi-omnipotent
++.align 16
++_bsaes_key_convert:
++.cfi_startproc
++ lea .Lmasks(%rip), $const
++ movdqu ($inp), %xmm7 # load round 0 key
++ lea 0x10($inp), $inp
++ movdqa 0x00($const), %xmm0 # 0x01...
++ movdqa 0x10($const), %xmm1 # 0x02...
++ movdqa 0x20($const), %xmm2 # 0x04...
++ movdqa 0x30($const), %xmm3 # 0x08...
++ movdqa 0x40($const), %xmm4 # .LM0
++ pcmpeqd %xmm5, %xmm5 # .LNOT
++
++ movdqu ($inp), %xmm6 # load round 1 key
++ movdqa %xmm7, ($out) # save round 0 key
++ lea 0x10($out), $out
++ dec $rounds
++ jmp .Lkey_loop
++.align 16
++.Lkey_loop:
++ pshufb %xmm4, %xmm6 # .LM0
++
++ movdqa %xmm0, %xmm8
++ movdqa %xmm1, %xmm9
++
++ pand %xmm6, %xmm8
++ pand %xmm6, %xmm9
++ movdqa %xmm2, %xmm10
++ pcmpeqb %xmm0, %xmm8
++ psllq \$4, %xmm0 # 0x10...
++ movdqa %xmm3, %xmm11
++ pcmpeqb %xmm1, %xmm9
++ psllq \$4, %xmm1 # 0x20...
++
++ pand %xmm6, %xmm10
++ pand %xmm6, %xmm11
++ movdqa %xmm0, %xmm12
++ pcmpeqb %xmm2, %xmm10
++ psllq \$4, %xmm2 # 0x40...
++ movdqa %xmm1, %xmm13
++ pcmpeqb %xmm3, %xmm11
++ psllq \$4, %xmm3 # 0x80...
++
++ movdqa %xmm2, %xmm14
++ movdqa %xmm3, %xmm15
++ pxor %xmm5, %xmm8 # "pnot"
++ pxor %xmm5, %xmm9
++
++ pand %xmm6, %xmm12
++ pand %xmm6, %xmm13
++ movdqa %xmm8, 0x00($out) # write bit-sliced round key
++ pcmpeqb %xmm0, %xmm12
++ psrlq \$4, %xmm0 # 0x01...
++ movdqa %xmm9, 0x10($out)
++ pcmpeqb %xmm1, %xmm13
++ psrlq \$4, %xmm1 # 0x02...
++ lea 0x10($inp), $inp
++
++ pand %xmm6, %xmm14
++ pand %xmm6, %xmm15
++ movdqa %xmm10, 0x20($out)
++ pcmpeqb %xmm2, %xmm14
++ psrlq \$4, %xmm2 # 0x04...
++ movdqa %xmm11, 0x30($out)
++ pcmpeqb %xmm3, %xmm15
++ psrlq \$4, %xmm3 # 0x08...
++ movdqu ($inp), %xmm6 # load next round key
++
++ pxor %xmm5, %xmm13 # "pnot"
++ pxor %xmm5, %xmm14
++ movdqa %xmm12, 0x40($out)
++ movdqa %xmm13, 0x50($out)
++ movdqa %xmm14, 0x60($out)
++ movdqa %xmm15, 0x70($out)
++ lea 0x80($out),$out
++ dec $rounds
++ jnz .Lkey_loop
++
++ movdqa 0x50($const), %xmm7 # .L63
++ #movdqa %xmm6, ($out) # don't save last round key
++ ret
++.cfi_endproc
++.size _bsaes_key_convert,.-_bsaes_key_convert
++___
++}
++
++if (0 && !$win64) { # constant-false guard: the following four functions are an unsupported interface
++ # used for benchmarking...
++$code.=<<___; # $inp, $out, $len and $key below are the file-scope %rdi, %rsi, %rdx, %rcx
++.globl bsaes_enc_key_convert
++.type bsaes_enc_key_convert,\@function,2
++.align 16
++bsaes_enc_key_convert:
++ mov 240($inp),%r10d # pass rounds
++ mov $inp,%rcx # pass key
++ mov $out,%rax # pass key schedule
++ call _bsaes_key_convert
++ pxor %xmm6,%xmm7 # fix up last round key
++ movdqa %xmm7,(%rax) # save last round key
++ ret
++.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
++
++.globl bsaes_encrypt_128
++.type bsaes_encrypt_128,\@function,4
++.align 16
++bsaes_encrypt_128:
++.Lenc128_loop:
++ movdqu 0x00($inp), @XMM[0] # load input
++ movdqu 0x10($inp), @XMM[1]
++ movdqu 0x20($inp), @XMM[2]
++ movdqu 0x30($inp), @XMM[3]
++ movdqu 0x40($inp), @XMM[4]
++ movdqu 0x50($inp), @XMM[5]
++ movdqu 0x60($inp), @XMM[6]
++ movdqu 0x70($inp), @XMM[7]
++ mov $key, %rax # pass the $key
++ lea 0x80($inp), $inp
++ mov \$10,%r10d
++
++ call _bsaes_encrypt8
++
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ movdqu @XMM[3], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[2], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++ sub \$0x80,$len
++ ja .Lenc128_loop
++ ret
++.size bsaes_encrypt_128,.-bsaes_encrypt_128
++
++.globl bsaes_dec_key_convert
++.type bsaes_dec_key_convert,\@function,2
++.align 16
++bsaes_dec_key_convert:
++ mov 240($inp),%r10d # pass rounds
++ mov $inp,%rcx # pass key
++ mov $out,%rax # pass key schedule
++ call _bsaes_key_convert
++ pxor ($out),%xmm7 # fix up round 0 key
++ movdqa %xmm6,(%rax) # save last round key
++ movdqa %xmm7,($out)
++ ret
++.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
++
++.globl bsaes_decrypt_128
++.type bsaes_decrypt_128,\@function,4
++.align 16
++bsaes_decrypt_128:
++.Ldec128_loop:
++ movdqu 0x00($inp), @XMM[0] # load input
++ movdqu 0x10($inp), @XMM[1]
++ movdqu 0x20($inp), @XMM[2]
++ movdqu 0x30($inp), @XMM[3]
++ movdqu 0x40($inp), @XMM[4]
++ movdqu 0x50($inp), @XMM[5]
++ movdqu 0x60($inp), @XMM[6]
++ movdqu 0x70($inp), @XMM[7]
++ mov $key, %rax # pass the $key
++ lea 0x80($inp), $inp
++ mov \$10,%r10d
++
++ call _bsaes_decrypt8
++
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[3], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++ sub \$0x80,$len
++ ja .Ldec128_loop
++ ret
++.size bsaes_decrypt_128,.-bsaes_decrypt_128
++___
++}
++{
++######################################################################
++#
++# OpenSSL interface
++#
++my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
++ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
++my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
++
++if ($ecb) {
++$code.=<<___;
++.globl bsaes_ecb_encrypt_blocks
++.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
++.align 16
++bsaes_ecb_encrypt_blocks:
++.cfi_startproc
++ mov %rsp, %rax
++.Lecb_enc_prologue:
++ push %rbp
++.cfi_push %rbp
++ push %rbx
++.cfi_push %rbx
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ lea -0x48(%rsp),%rsp
++.cfi_adjust_cfa_offset 0x48
++___
++$code.=<<___ if ($win64);
++ lea -0xa0(%rsp), %rsp
++ movaps %xmm6, 0x40(%rsp)
++ movaps %xmm7, 0x50(%rsp)
++ movaps %xmm8, 0x60(%rsp)
++ movaps %xmm9, 0x70(%rsp)
++ movaps %xmm10, 0x80(%rsp)
++ movaps %xmm11, 0x90(%rsp)
++ movaps %xmm12, 0xa0(%rsp)
++ movaps %xmm13, 0xb0(%rsp)
++ movaps %xmm14, 0xc0(%rsp)
++ movaps %xmm15, 0xd0(%rsp)
++.Lecb_enc_body:
++___
++$code.=<<___;
++ mov %rsp,%rbp # backup %rsp
++.cfi_def_cfa_register %rbp
++ mov 240($arg4),%eax # rounds
++ mov $arg1,$inp # backup arguments
++ mov $arg2,$out
++ mov $arg3,$len
++ mov $arg4,$key
++ cmp \$8,$arg3
++ jb .Lecb_enc_short
++
++ mov %eax,%ebx # backup rounds
++ shl \$7,%rax # 128 bytes per inner round key
++ sub \$`128-32`,%rax # size of bit-sliced key schedule
++ sub %rax,%rsp
++ mov %rsp,%rax # pass key schedule
++ mov $key,%rcx # pass key
++ mov %ebx,%r10d # pass rounds
++ call _bsaes_key_convert
++ pxor %xmm6,%xmm7 # fix up last round key
++ movdqa %xmm7,(%rax) # save last round key
++
++ sub \$8,$len
++.Lecb_enc_loop:
++ movdqu 0x00($inp), @XMM[0] # load input
++ movdqu 0x10($inp), @XMM[1]
++ movdqu 0x20($inp), @XMM[2]
++ movdqu 0x30($inp), @XMM[3]
++ movdqu 0x40($inp), @XMM[4]
++ movdqu 0x50($inp), @XMM[5]
++ mov %rsp, %rax # pass key schedule
++ movdqu 0x60($inp), @XMM[6]
++ mov %ebx,%r10d # pass rounds
++ movdqu 0x70($inp), @XMM[7]
++ lea 0x80($inp), $inp
++
++ call _bsaes_encrypt8
++
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ movdqu @XMM[3], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[2], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++ sub \$8,$len
++ jnc .Lecb_enc_loop
++
++ add \$8,$len
++ jz .Lecb_enc_done
++
++ movdqu 0x00($inp), @XMM[0] # load input
++ mov %rsp, %rax # pass key schedule
++ mov %ebx,%r10d # pass rounds
++ cmp \$2,$len
++ jb .Lecb_enc_one
++ movdqu 0x10($inp), @XMM[1]
++ je .Lecb_enc_two
++ movdqu 0x20($inp), @XMM[2]
++ cmp \$4,$len
++ jb .Lecb_enc_three
++ movdqu 0x30($inp), @XMM[3]
++ je .Lecb_enc_four
++ movdqu 0x40($inp), @XMM[4]
++ cmp \$6,$len
++ jb .Lecb_enc_five
++ movdqu 0x50($inp), @XMM[5]
++ je .Lecb_enc_six
++ movdqu 0x60($inp), @XMM[6]
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ movdqu @XMM[3], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[2], 0x60($out)
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_six:
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ movdqu @XMM[3], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_five:
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ movdqu @XMM[3], 0x40($out)
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_four:
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_three:
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_two:
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_one:
++ call _bsaes_encrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ jmp .Lecb_enc_done
++.align 16
++.Lecb_enc_short:
++ lea ($inp), $arg1
++ lea ($out), $arg2
++ lea ($key), $arg3
++ call asm_AES_encrypt
++ lea 16($inp), $inp
++ lea 16($out), $out
++ dec $len
++ jnz .Lecb_enc_short
++
++.Lecb_enc_done:
++ lea (%rsp),%rax
++ pxor %xmm0, %xmm0
++.Lecb_enc_bzero: # wipe key schedule [if any]
++ movdqa %xmm0, 0x00(%rax)
++ movdqa %xmm0, 0x10(%rax)
++ lea 0x20(%rax), %rax
++ cmp %rax, %rbp
++ jb .Lecb_enc_bzero
++
++ lea 0x78(%rbp),%rax
++.cfi_def_cfa %rax,8
++___
++$code.=<<___ if ($win64);
++ movaps 0x40(%rbp), %xmm6
++ movaps 0x50(%rbp), %xmm7
++ movaps 0x60(%rbp), %xmm8
++ movaps 0x70(%rbp), %xmm9
++ movaps 0x80(%rbp), %xmm10
++ movaps 0x90(%rbp), %xmm11
++ movaps 0xa0(%rbp), %xmm12
++ movaps 0xb0(%rbp), %xmm13
++ movaps 0xc0(%rbp), %xmm14
++ movaps 0xd0(%rbp), %xmm15
++ lea 0xa0(%rax), %rax
++.Lecb_enc_tail:
++___
++$code.=<<___;
++ mov -48(%rax), %r15
++.cfi_restore %r15
++ mov -40(%rax), %r14
++.cfi_restore %r14
++ mov -32(%rax), %r13
++.cfi_restore %r13
++ mov -24(%rax), %r12
++.cfi_restore %r12
++ mov -16(%rax), %rbx
++.cfi_restore %rbx
++ mov -8(%rax), %rbp
++.cfi_restore %rbp
++ lea (%rax), %rsp # restore %rsp
++.cfi_def_cfa_register %rsp
++.Lecb_enc_epilogue:
++ ret
++.cfi_endproc
++.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
++
++.globl bsaes_ecb_decrypt_blocks # ECB decrypt: arg1=in, arg2=out, arg3=block count, arg4=key
++.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
++.align 16
++bsaes_ecb_decrypt_blocks:
++.cfi_startproc
++ mov %rsp, %rax
++.Lecb_dec_prologue:
++ push %rbp
++.cfi_push %rbp
++ push %rbx
++.cfi_push %rbx
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ lea -0x48(%rsp),%rsp # 0x48 bytes of scratch
++.cfi_adjust_cfa_offset 0x48
++___
++$code.=<<___ if ($win64);
++ lea -0xa0(%rsp), %rsp # room for the ten xmm saves below
++ movaps %xmm6, 0x40(%rsp)
++ movaps %xmm7, 0x50(%rsp)
++ movaps %xmm8, 0x60(%rsp)
++ movaps %xmm9, 0x70(%rsp)
++ movaps %xmm10, 0x80(%rsp)
++ movaps %xmm11, 0x90(%rsp)
++ movaps %xmm12, 0xa0(%rsp)
++ movaps %xmm13, 0xb0(%rsp)
++ movaps %xmm14, 0xc0(%rsp)
++ movaps %xmm15, 0xd0(%rsp)
++.Lecb_dec_body:
++___
++$code.=<<___;
++ mov %rsp,%rbp # backup %rsp
++.cfi_def_cfa_register %rbp
++ mov 240($arg4),%eax # rounds
++ mov $arg1,$inp # backup arguments
++ mov $arg2,$out
++ mov $arg3,$len
++ mov $arg4,$key
++ cmp \$8,$arg3
++ jb .Lecb_dec_short # fewer than 8 blocks: one block per call path
++
++ mov %eax,%ebx # backup rounds
++ shl \$7,%rax # 128 bytes per inner round key
++ sub \$`128-32`,%rax # size of bit-sliced key schedule
++ sub %rax,%rsp # carve the schedule out of the stack
++ mov %rsp,%rax # pass key schedule
++ mov $key,%rcx # pass key
++ mov %ebx,%r10d # pass rounds
++ call _bsaes_key_convert
++ pxor (%rsp),%xmm7 # fix up 0 round key
++ movdqa %xmm6,(%rax) # save last round key
++ movdqa %xmm7,(%rsp)
++
++ sub \$8,$len # bias the count: carry means fewer than 8 blocks left
++.Lecb_dec_loop:
++ movdqu 0x00($inp), @XMM[0] # load input
++ movdqu 0x10($inp), @XMM[1]
++ movdqu 0x20($inp), @XMM[2]
++ movdqu 0x30($inp), @XMM[3]
++ movdqu 0x40($inp), @XMM[4]
++ movdqu 0x50($inp), @XMM[5]
++ mov %rsp, %rax # pass key schedule
++ movdqu 0x60($inp), @XMM[6]
++ mov %ebx,%r10d # pass rounds
++ movdqu 0x70($inp), @XMM[7]
++ lea 0x80($inp), $inp
++
++ call _bsaes_decrypt8 # results are stored from registers 0,1,6,4,2,7,3,5
++
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[3], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++ sub \$8,$len
++ jnc .Lecb_dec_loop # at least 8 more blocks
++
++ add \$8,$len # undo the bias: 0..7 blocks remain
++ jz .Lecb_dec_done
++
++ movdqu 0x00($inp), @XMM[0] # load input
++ mov %rsp, %rax # pass key schedule
++ mov %ebx,%r10d # pass rounds
++ cmp \$2,$len # dispatch on the remaining count, loading only that many blocks
++ jb .Lecb_dec_one
++ movdqu 0x10($inp), @XMM[1]
++ je .Lecb_dec_two # flags still from the cmp, movdqu leaves them alone
++ movdqu 0x20($inp), @XMM[2]
++ cmp \$4,$len
++ jb .Lecb_dec_three
++ movdqu 0x30($inp), @XMM[3]
++ je .Lecb_dec_four
++ movdqu 0x40($inp), @XMM[4]
++ cmp \$6,$len
++ jb .Lecb_dec_five
++ movdqu 0x50($inp), @XMM[5]
++ je .Lecb_dec_six
++ movdqu 0x60($inp), @XMM[6] # seven blocks left
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[3], 0x60($out)
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_six:
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_five:
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_four:
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_three:
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_two:
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_one:
++ call _bsaes_decrypt8
++ movdqu @XMM[0], 0x00($out) # write output
++ jmp .Lecb_dec_done
++.align 16
++.Lecb_dec_short: # fewer than 8 blocks in total: one asm_AES_decrypt call per block
++ lea ($inp), $arg1
++ lea ($out), $arg2
++ lea ($key), $arg3
++ call asm_AES_decrypt
++ lea 16($inp), $inp
++ lea 16($out), $out
++ dec $len
++ jnz .Lecb_dec_short
++
++.Lecb_dec_done:
++ lea (%rsp),%rax
++ pxor %xmm0, %xmm0
++.Lecb_dec_bzero: # wipe key schedule [if any]
++ movdqa %xmm0, 0x00(%rax)
++ movdqa %xmm0, 0x10(%rax)
++ lea 0x20(%rax), %rax
++ cmp %rax, %rbp
++ ja .Lecb_dec_bzero # unsigned: keep wiping only while %rax is below %rbp
++
++ lea 0x78(%rbp),%rax # 0x48 scratch plus six 8-byte pushes
++.cfi_def_cfa %rax,8
++___
++$code.=<<___ if ($win64);
++ movaps 0x40(%rbp), %xmm6
++ movaps 0x50(%rbp), %xmm7
++ movaps 0x60(%rbp), %xmm8
++ movaps 0x70(%rbp), %xmm9
++ movaps 0x80(%rbp), %xmm10
++ movaps 0x90(%rbp), %xmm11
++ movaps 0xa0(%rbp), %xmm12
++ movaps 0xb0(%rbp), %xmm13
++ movaps 0xc0(%rbp), %xmm14
++ movaps 0xd0(%rbp), %xmm15
++ lea 0xa0(%rax), %rax # step over the xmm save area
++.Lecb_dec_tail:
++___
++$code.=<<___;
++ mov -48(%rax), %r15
++.cfi_restore %r15
++ mov -40(%rax), %r14
++.cfi_restore %r14
++ mov -32(%rax), %r13
++.cfi_restore %r13
++ mov -24(%rax), %r12
++.cfi_restore %r12
++ mov -16(%rax), %rbx
++.cfi_restore %rbx
++ mov -8(%rax), %rbp
++.cfi_restore %rbp
++ lea (%rax), %rsp # restore %rsp
++.cfi_def_cfa_register %rsp
++.Lecb_dec_epilogue:
++ ret
++.cfi_endproc
++.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
++___
++}
++$code.=<<___;
++.extern asm_AES_cbc_encrypt
++.globl bsaes_cbc_encrypt # CBC entry: only decryption of 128 bytes or more is done here, the rest goes to asm_AES_cbc_encrypt
++.type bsaes_cbc_encrypt,\@abi-omnipotent
++.align 16
++bsaes_cbc_encrypt:
++.cfi_startproc
++___
++$code.=<<___ if ($win64);
++ mov 48(%rsp),$arg6 # pull direction flag
++___
++$code.=<<___;
++ cmp \$0,$arg6
++ jne asm_AES_cbc_encrypt # flag nonzero: not the decrypt path implemented below
++ cmp \$128,$arg3 # length is in bytes here, see the shift by 4 below
++ jb asm_AES_cbc_encrypt # under 8 blocks: hand off as well
++
++ mov %rsp, %rax
++.Lcbc_dec_prologue:
++ push %rbp
++.cfi_push %rbp
++ push %rbx
++.cfi_push %rbx
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ lea -0x48(%rsp), %rsp # 0x48 bytes of scratch, the IV is parked at 0x20 in it
++.cfi_adjust_cfa_offset 0x48
++___
++$code.=<<___ if ($win64);
++ mov 0xa0(%rsp),$arg5 # pull ivp
++ lea -0xa0(%rsp), %rsp # room for the ten xmm saves below
++ movaps %xmm6, 0x40(%rsp)
++ movaps %xmm7, 0x50(%rsp)
++ movaps %xmm8, 0x60(%rsp)
++ movaps %xmm9, 0x70(%rsp)
++ movaps %xmm10, 0x80(%rsp)
++ movaps %xmm11, 0x90(%rsp)
++ movaps %xmm12, 0xa0(%rsp)
++ movaps %xmm13, 0xb0(%rsp)
++ movaps %xmm14, 0xc0(%rsp)
++ movaps %xmm15, 0xd0(%rsp)
++.Lcbc_dec_body:
++___
++$code.=<<___;
++ mov %rsp, %rbp # backup %rsp
++.cfi_def_cfa_register %rbp
++ mov 240($arg4), %eax # rounds
++ mov $arg1, $inp # backup arguments
++ mov $arg2, $out
++ mov $arg3, $len
++ mov $arg4, $key
++ mov $arg5, %rbx # keep ivp for the IV load and the final IV store
++ shr \$4, $len # bytes to blocks
++
++ mov %eax, %edx # rounds
++ shl \$7, %rax # 128 bytes per inner round key
++ sub \$`128-32`, %rax # size of bit-sliced key schedule
++ sub %rax, %rsp # carve the schedule out of the stack
++
++ mov %rsp, %rax # pass key schedule
++ mov $key, %rcx # pass key
++ mov %edx, %r10d # pass rounds
++ call _bsaes_key_convert
++ pxor (%rsp),%xmm7 # fix up 0 round key
++ movdqa %xmm6,(%rax) # save last round key
++ movdqa %xmm7,(%rsp)
++
++ movdqu (%rbx), @XMM[15] # load IV
++ sub \$8,$len # bias the count: carry means fewer than 8 blocks left
++.Lcbc_dec_loop:
++ movdqu 0x00($inp), @XMM[0] # load input
++ movdqu 0x10($inp), @XMM[1]
++ movdqu 0x20($inp), @XMM[2]
++ movdqu 0x30($inp), @XMM[3]
++ movdqu 0x40($inp), @XMM[4]
++ movdqu 0x50($inp), @XMM[5]
++ mov %rsp, %rax # pass key schedule
++ movdqu 0x60($inp), @XMM[6]
++ mov %edx,%r10d # pass rounds
++ movdqu 0x70($inp), @XMM[7]
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++
++ call _bsaes_decrypt8 # results are consumed from registers 0,1,6,4,2,7,3,5
++
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[8], @XMM[1] # each later block is xored with the preceding input block
++ movdqu 0x20($inp), @XMM[10]
++ pxor @XMM[9], @XMM[6]
++ movdqu 0x30($inp), @XMM[11]
++ pxor @XMM[10], @XMM[4]
++ movdqu 0x40($inp), @XMM[12]
++ pxor @XMM[11], @XMM[2]
++ movdqu 0x50($inp), @XMM[13]
++ pxor @XMM[12], @XMM[7]
++ movdqu 0x60($inp), @XMM[14]
++ pxor @XMM[13], @XMM[3]
++ movdqu 0x70($inp), @XMM[15] # IV
++ pxor @XMM[14], @XMM[5]
++ movdqu @XMM[0], 0x00($out) # write output
++ lea 0x80($inp), $inp
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[3], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++ sub \$8,$len
++ jnc .Lcbc_dec_loop # at least 8 more blocks
++
++ add \$8,$len # undo the bias: 0..7 blocks remain
++ jz .Lcbc_dec_done
++
++ movdqu 0x00($inp), @XMM[0] # load input
++ mov %rsp, %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++ cmp \$2,$len # dispatch on the remaining count, loading only that many blocks
++ jb .Lcbc_dec_one
++ movdqu 0x10($inp), @XMM[1]
++ je .Lcbc_dec_two # flags still from the cmp, movdqu leaves them alone
++ movdqu 0x20($inp), @XMM[2]
++ cmp \$4,$len
++ jb .Lcbc_dec_three
++ movdqu 0x30($inp), @XMM[3]
++ je .Lcbc_dec_four
++ movdqu 0x40($inp), @XMM[4]
++ cmp \$6,$len
++ jb .Lcbc_dec_five
++ movdqu 0x50($inp), @XMM[5]
++ je .Lcbc_dec_six
++ movdqu 0x60($inp), @XMM[6] # seven blocks left
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++ call _bsaes_decrypt8
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[8], @XMM[1]
++ movdqu 0x20($inp), @XMM[10]
++ pxor @XMM[9], @XMM[6]
++ movdqu 0x30($inp), @XMM[11]
++ pxor @XMM[10], @XMM[4]
++ movdqu 0x40($inp), @XMM[12]
++ pxor @XMM[11], @XMM[2]
++ movdqu 0x50($inp), @XMM[13]
++ pxor @XMM[12], @XMM[7]
++ movdqu 0x60($inp), @XMM[15] # IV
++ pxor @XMM[13], @XMM[3]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[3], 0x60($out)
++ jmp .Lcbc_dec_done
++.align 16
++.Lcbc_dec_six:
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++ call _bsaes_decrypt8
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[8], @XMM[1]
++ movdqu 0x20($inp), @XMM[10]
++ pxor @XMM[9], @XMM[6]
++ movdqu 0x30($inp), @XMM[11]
++ pxor @XMM[10], @XMM[4]
++ movdqu 0x40($inp), @XMM[12]
++ pxor @XMM[11], @XMM[2]
++ movdqu 0x50($inp), @XMM[15] # IV
++ pxor @XMM[12], @XMM[7]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ jmp .Lcbc_dec_done
++.align 16
++.Lcbc_dec_five:
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++ call _bsaes_decrypt8
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[8], @XMM[1]
++ movdqu 0x20($inp), @XMM[10]
++ pxor @XMM[9], @XMM[6]
++ movdqu 0x30($inp), @XMM[11]
++ pxor @XMM[10], @XMM[4]
++ movdqu 0x40($inp), @XMM[15] # IV
++ pxor @XMM[11], @XMM[2]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ jmp .Lcbc_dec_done
++.align 16
++.Lcbc_dec_four:
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++ call _bsaes_decrypt8
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[8], @XMM[1]
++ movdqu 0x20($inp), @XMM[10]
++ pxor @XMM[9], @XMM[6]
++ movdqu 0x30($inp), @XMM[15] # IV
++ pxor @XMM[10], @XMM[4]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ jmp .Lcbc_dec_done
++.align 16
++.Lcbc_dec_three:
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++ call _bsaes_decrypt8
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[8], @XMM[1]
++ movdqu 0x20($inp), @XMM[15] # IV
++ pxor @XMM[9], @XMM[6]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ jmp .Lcbc_dec_done
++.align 16
++.Lcbc_dec_two:
++ movdqa @XMM[15], 0x20(%rbp) # put aside IV
++ call _bsaes_decrypt8
++ pxor 0x20(%rbp), @XMM[0] # ^= IV
++ movdqu 0x00($inp), @XMM[8] # re-load input
++ movdqu 0x10($inp), @XMM[15] # IV
++ pxor @XMM[8], @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ jmp .Lcbc_dec_done
++.align 16
++.Lcbc_dec_one: # single block left: plain asm_AES_decrypt into the stack buffer
++ lea ($inp), $arg1
++ lea 0x20(%rbp), $arg2 # buffer output
++ lea ($key), $arg3
++ call asm_AES_decrypt # doesn't touch %xmm
++ pxor 0x20(%rbp), @XMM[15] # ^= IV
++ movdqu @XMM[15], ($out) # write output
++ movdqa @XMM[0], @XMM[15] # IV
++
++.Lcbc_dec_done:
++ movdqu @XMM[15], (%rbx) # return IV
++ lea (%rsp), %rax
++ pxor %xmm0, %xmm0
++.Lcbc_dec_bzero: # wipe key schedule [if any]
++ movdqa %xmm0, 0x00(%rax)
++ movdqa %xmm0, 0x10(%rax)
++ lea 0x20(%rax), %rax
++ cmp %rax, %rbp
++ ja .Lcbc_dec_bzero # unsigned: keep wiping while %rax is below %rbp
++
++ lea 0x78(%rbp),%rax # 0x48 scratch plus six 8-byte pushes
++.cfi_def_cfa %rax,8
++___
++$code.=<<___ if ($win64);
++ movaps 0x40(%rbp), %xmm6
++ movaps 0x50(%rbp), %xmm7
++ movaps 0x60(%rbp), %xmm8
++ movaps 0x70(%rbp), %xmm9
++ movaps 0x80(%rbp), %xmm10
++ movaps 0x90(%rbp), %xmm11
++ movaps 0xa0(%rbp), %xmm12
++ movaps 0xb0(%rbp), %xmm13
++ movaps 0xc0(%rbp), %xmm14
++ movaps 0xd0(%rbp), %xmm15
++ lea 0xa0(%rax), %rax # step over the xmm save area
++.Lcbc_dec_tail:
++___
++$code.=<<___;
++ mov -48(%rax), %r15
++.cfi_restore %r15
++ mov -40(%rax), %r14
++.cfi_restore %r14
++ mov -32(%rax), %r13
++.cfi_restore %r13
++ mov -24(%rax), %r12
++.cfi_restore %r12
++ mov -16(%rax), %rbx
++.cfi_restore %rbx
++ mov -8(%rax), %rbp
++.cfi_restore %rbp
++ lea (%rax), %rsp # restore %rsp
++.cfi_def_cfa_register %rsp
++.Lcbc_dec_epilogue:
++ ret
++.cfi_endproc
++.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
++
++.globl bsaes_ctr32_encrypt_blocks # CTR mode: arg1=in, arg2=out, arg3=block count, arg4=key, arg5=counter block
++.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
++.align 16
++bsaes_ctr32_encrypt_blocks:
++.cfi_startproc
++ mov %rsp, %rax
++.Lctr_enc_prologue:
++ push %rbp
++.cfi_push %rbp
++ push %rbx
++.cfi_push %rbx
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ lea -0x48(%rsp), %rsp # 0x48 bytes of scratch, counter copy lives at 0x20 in it
++.cfi_adjust_cfa_offset 0x48
++___
++$code.=<<___ if ($win64);
++ mov 0xa0(%rsp),$arg5 # pull ivp
++ lea -0xa0(%rsp), %rsp # room for the ten xmm saves below
++ movaps %xmm6, 0x40(%rsp)
++ movaps %xmm7, 0x50(%rsp)
++ movaps %xmm8, 0x60(%rsp)
++ movaps %xmm9, 0x70(%rsp)
++ movaps %xmm10, 0x80(%rsp)
++ movaps %xmm11, 0x90(%rsp)
++ movaps %xmm12, 0xa0(%rsp)
++ movaps %xmm13, 0xb0(%rsp)
++ movaps %xmm14, 0xc0(%rsp)
++ movaps %xmm15, 0xd0(%rsp)
++.Lctr_enc_body:
++___
++$code.=<<___;
++ mov %rsp, %rbp # backup %rsp
++.cfi_def_cfa_register %rbp
++ movdqu ($arg5), %xmm0 # load counter
++ mov 240($arg4), %eax # rounds
++ mov $arg1, $inp # backup arguments
++ mov $arg2, $out
++ mov $arg3, $len
++ mov $arg4, $key
++ movdqa %xmm0, 0x20(%rbp) # copy counter
++ cmp \$8, $arg3
++ jb .Lctr_enc_short # fewer than 8 blocks: one block per call path
++
++ mov %eax, %ebx # rounds
++ shl \$7, %rax # 128 bytes per inner round key
++ sub \$`128-32`, %rax # size of bit-sliced key schedule
++ sub %rax, %rsp # carve the schedule out of the stack
++
++ mov %rsp, %rax # pass key schedule
++ mov $key, %rcx # pass key
++ mov %ebx, %r10d # pass rounds
++ call _bsaes_key_convert
++ pxor %xmm6,%xmm7 # fix up last round key
++ movdqa %xmm7,(%rax) # save last round key
++
++ movdqa (%rsp), @XMM[9] # load round0 key
++ lea .LADD1(%rip), %r11
++ movdqa 0x20(%rbp), @XMM[0] # counter copy
++ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
++ pshufb @XMM[8], @XMM[9] # byte swap upper part
++ pshufb @XMM[8], @XMM[0]
++ movdqa @XMM[9], (%rsp) # save adjusted round0 key
++ jmp .Lctr_enc_loop
++.align 16
++.Lctr_enc_loop:
++ movdqa @XMM[0], 0x20(%rbp) # save counter
++ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
++ movdqa @XMM[0], @XMM[2]
++ paddd 0x00(%r11), @XMM[1] # .LADD1
++ movdqa @XMM[0], @XMM[3]
++ paddd 0x10(%r11), @XMM[2] # .LADD2
++ movdqa @XMM[0], @XMM[4]
++ paddd 0x20(%r11), @XMM[3] # .LADD3
++ movdqa @XMM[0], @XMM[5]
++ paddd 0x30(%r11), @XMM[4] # .LADD4
++ movdqa @XMM[0], @XMM[6]
++ paddd 0x40(%r11), @XMM[5] # .LADD5
++ movdqa @XMM[0], @XMM[7]
++ paddd 0x50(%r11), @XMM[6] # .LADD6
++ paddd 0x60(%r11), @XMM[7] # .LADD7
++
++ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
++ # to flip byte order in 32-bit counter
++ movdqa (%rsp), @XMM[9] # round 0 key
++ lea 0x10(%rsp), %rax # pass key schedule
++ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
++ pxor @XMM[9], @XMM[0] # xor with round0 key
++ pxor @XMM[9], @XMM[1]
++ pxor @XMM[9], @XMM[2]
++ pxor @XMM[9], @XMM[3]
++ pshufb @XMM[8], @XMM[0]
++ pshufb @XMM[8], @XMM[1]
++ pxor @XMM[9], @XMM[4]
++ pxor @XMM[9], @XMM[5]
++ pshufb @XMM[8], @XMM[2]
++ pshufb @XMM[8], @XMM[3]
++ pxor @XMM[9], @XMM[6]
++ pxor @XMM[9], @XMM[7]
++ pshufb @XMM[8], @XMM[4]
++ pshufb @XMM[8], @XMM[5]
++ pshufb @XMM[8], @XMM[6]
++ pshufb @XMM[8], @XMM[7]
++ lea .LBS0(%rip), %r11 # constants table
++ mov %ebx,%r10d # pass rounds
++
++ call _bsaes_encrypt8_bitslice # key stream comes back in registers 0,1,4,6,3,7,2,5
++
++ sub \$8,$len
++ jc .Lctr_enc_loop_done # fewer than 8 blocks were left: partial output
++
++ movdqu 0x00($inp), @XMM[8] # load input
++ movdqu 0x10($inp), @XMM[9]
++ movdqu 0x20($inp), @XMM[10]
++ movdqu 0x30($inp), @XMM[11]
++ movdqu 0x40($inp), @XMM[12]
++ movdqu 0x50($inp), @XMM[13]
++ movdqu 0x60($inp), @XMM[14]
++ movdqu 0x70($inp), @XMM[15]
++ lea 0x80($inp),$inp
++ pxor @XMM[0], @XMM[8]
++ movdqa 0x20(%rbp), @XMM[0] # load counter
++ pxor @XMM[9], @XMM[1]
++ movdqu @XMM[8], 0x00($out) # write output
++ pxor @XMM[10], @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ pxor @XMM[11], @XMM[6]
++ movdqu @XMM[4], 0x20($out)
++ pxor @XMM[12], @XMM[3]
++ movdqu @XMM[6], 0x30($out)
++ pxor @XMM[13], @XMM[7]
++ movdqu @XMM[3], 0x40($out)
++ pxor @XMM[14], @XMM[2]
++ movdqu @XMM[7], 0x50($out)
++ pxor @XMM[15], @XMM[5]
++ movdqu @XMM[2], 0x60($out)
++ lea .LADD1(%rip), %r11
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++ paddd 0x70(%r11), @XMM[0] # .LADD8
++ jnz .Lctr_enc_loop # flags still from the sub above, SSE ops and lea leave them alone
++
++ jmp .Lctr_enc_done
++.align 16
++.Lctr_enc_loop_done:
++ add \$8, $len # undo the subtraction: 1..7 blocks remain
++ movdqu 0x00($inp), @XMM[8] # load input
++ pxor @XMM[8], @XMM[0]
++ movdqu @XMM[0], 0x00($out) # write output
++ cmp \$2,$len
++ jb .Lctr_enc_done
++ movdqu 0x10($inp), @XMM[9]
++ pxor @XMM[9], @XMM[1]
++ movdqu @XMM[1], 0x10($out)
++ je .Lctr_enc_done # flags still from the cmp above
++ movdqu 0x20($inp), @XMM[10]
++ pxor @XMM[10], @XMM[4]
++ movdqu @XMM[4], 0x20($out)
++ cmp \$4,$len
++ jb .Lctr_enc_done
++ movdqu 0x30($inp), @XMM[11]
++ pxor @XMM[11], @XMM[6]
++ movdqu @XMM[6], 0x30($out)
++ je .Lctr_enc_done
++ movdqu 0x40($inp), @XMM[12]
++ pxor @XMM[12], @XMM[3]
++ movdqu @XMM[3], 0x40($out)
++ cmp \$6,$len
++ jb .Lctr_enc_done
++ movdqu 0x50($inp), @XMM[13]
++ pxor @XMM[13], @XMM[7]
++ movdqu @XMM[7], 0x50($out)
++ je .Lctr_enc_done
++ movdqu 0x60($inp), @XMM[14]
++ pxor @XMM[14], @XMM[2]
++ movdqu @XMM[2], 0x60($out)
++ jmp .Lctr_enc_done
++
++.align 16
++.Lctr_enc_short: # fewer than 8 blocks in total: encrypt the counter block once per block
++ lea 0x20(%rbp), $arg1
++ lea 0x30(%rbp), $arg2
++ lea ($key), $arg3
++ call asm_AES_encrypt
++ movdqu ($inp), @XMM[1]
++ lea 16($inp), $inp
++ mov 0x2c(%rbp), %eax # load 32-bit counter
++ bswap %eax
++ pxor 0x30(%rbp), @XMM[1]
++ inc %eax # increment
++ movdqu @XMM[1], ($out)
++ bswap %eax
++ lea 16($out), $out
++ mov %eax, 0x2c(%rsp) # save 32-bit counter (%rsp equals %rbp on this path, no schedule was allocated)
++ dec $len
++ jnz .Lctr_enc_short
++
++.Lctr_enc_done:
++ lea (%rsp), %rax
++ pxor %xmm0, %xmm0
++.Lctr_enc_bzero: # wipe key schedule [if any]
++ movdqa %xmm0, 0x00(%rax)
++ movdqa %xmm0, 0x10(%rax)
++ lea 0x20(%rax), %rax
++ cmp %rax, %rbp
++ ja .Lctr_enc_bzero # unsigned: keep wiping while %rax is below %rbp
++
++ lea 0x78(%rbp),%rax # 0x48 scratch plus six 8-byte pushes
++.cfi_def_cfa %rax,8
++___
++$code.=<<___ if ($win64);
++ movaps 0x40(%rbp), %xmm6
++ movaps 0x50(%rbp), %xmm7
++ movaps 0x60(%rbp), %xmm8
++ movaps 0x70(%rbp), %xmm9
++ movaps 0x80(%rbp), %xmm10
++ movaps 0x90(%rbp), %xmm11
++ movaps 0xa0(%rbp), %xmm12
++ movaps 0xb0(%rbp), %xmm13
++ movaps 0xc0(%rbp), %xmm14
++ movaps 0xd0(%rbp), %xmm15
++ lea 0xa0(%rax), %rax # step over the xmm save area
++.Lctr_enc_tail:
++___
++$code.=<<___;
++ mov -48(%rax), %r15
++.cfi_restore %r15
++ mov -40(%rax), %r14
++.cfi_restore %r14
++ mov -32(%rax), %r13
++.cfi_restore %r13
++ mov -24(%rax), %r12
++.cfi_restore %r12
++ mov -16(%rax), %rbx
++.cfi_restore %rbx
++ mov -8(%rax), %rbp
++.cfi_restore %rbp
++ lea (%rax), %rsp # restore %rsp
++.cfi_def_cfa_register %rsp
++.Lctr_enc_epilogue:
++ ret
++.cfi_endproc
++.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
++___
++######################################################################
++# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
++# const AES_KEY *key1, const AES_KEY *key2,
++# const unsigned char iv[16]);
++#
++my ($twmask,$twres,$twtmp)=@XMM[13..15]; # XTS tweak helpers: magic mask, carry/residue, broadcast upper bits
++$arg6=~s/d$//; # drop a trailing "d" from the register name; presumably yields the 64-bit form since XTS uses arg6 as a pointer -- confirm against its definition
++
++$code.=<<___;
++.globl bsaes_xts_encrypt # XTS encrypt: arg1=in, arg2=out, arg3=length in bytes, arg4=key1, arg5=key2, arg6=iv
++.type bsaes_xts_encrypt,\@abi-omnipotent
++.align 16
++bsaes_xts_encrypt:
++.cfi_startproc
++ mov %rsp, %rax
++.Lxts_enc_prologue:
++ push %rbp
++.cfi_push %rbp
++ push %rbx
++.cfi_push %rbx
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ lea -0x48(%rsp), %rsp # 0x48 bytes of scratch, a one-block buffer lives at 0x20 in it
++.cfi_adjust_cfa_offset 0x48
++___
++$code.=<<___ if ($win64);
++ mov 0xa0(%rsp),$arg5 # pull key2
++ mov 0xa8(%rsp),$arg6 # pull ivp
++ lea -0xa0(%rsp), %rsp # room for the ten xmm saves below
++ movaps %xmm6, 0x40(%rsp)
++ movaps %xmm7, 0x50(%rsp)
++ movaps %xmm8, 0x60(%rsp)
++ movaps %xmm9, 0x70(%rsp)
++ movaps %xmm10, 0x80(%rsp)
++ movaps %xmm11, 0x90(%rsp)
++ movaps %xmm12, 0xa0(%rsp)
++ movaps %xmm13, 0xb0(%rsp)
++ movaps %xmm14, 0xc0(%rsp)
++ movaps %xmm15, 0xd0(%rsp)
++.Lxts_enc_body:
++___
++$code.=<<___;
++ mov %rsp, %rbp # backup %rsp
++.cfi_def_cfa_register %rbp
++ mov $arg1, $inp # backup arguments
++ mov $arg2, $out
++ mov $arg3, $len
++ mov $arg4, $key
++
++ lea ($arg6), $arg1
++ lea 0x20(%rbp), $arg2
++ lea ($arg5), $arg3
++ call asm_AES_encrypt # generate initial tweak
++
++ mov 240($key), %eax # rounds
++ mov $len, %rbx # backup $len
++
++ mov %eax, %edx # rounds
++ shl \$7, %rax # 128 bytes per inner round key
++ sub \$`128-32`, %rax # size of bit-sliced key schedule
++ sub %rax, %rsp # carve the schedule out of the stack
++
++ mov %rsp, %rax # pass key schedule
++ mov $key, %rcx # pass key
++ mov %edx, %r10d # pass rounds
++ call _bsaes_key_convert
++ pxor %xmm6, %xmm7 # fix up last round key
++ movdqa %xmm7, (%rax) # save last round key
++
++ and \$-16, $len # round the length down to whole blocks
++ sub \$0x80, %rsp # place for tweak[8]
++ movdqa 0x20(%rbp), @XMM[7] # initial tweak
++
++ pxor $twtmp, $twtmp
++ movdqa .Lxts_magic(%rip), $twmask
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++
++ sub \$0x80, $len
++ jc .Lxts_enc_short # less than 0x80 bytes of whole blocks
++ jmp .Lxts_enc_loop
++
++.align 16
++.Lxts_enc_loop:
++___
++ for ($i=0;$i<7;$i++) { # emit tweak generation for blocks 0..6, interleaved with input loads
++ $code.=<<___;
++ pshufd \$0x13, $twtmp, $twres
++ pxor $twtmp, $twtmp
++ movdqa @XMM[7], @XMM[$i]
++ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++ pxor $twres, @XMM[7]
++___
++ $code.=<<___ if ($i>=1);
++ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
++___
++ $code.=<<___ if ($i>=2);
++ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
++___
++ }
++$code.=<<___;
++ movdqu 0x60($inp), @XMM[8+6]
++ pxor @XMM[8+5], @XMM[5]
++ movdqu 0x70($inp), @XMM[8+7]
++ lea 0x80($inp), $inp
++ movdqa @XMM[7], 0x70(%rsp) # save the eighth tweak
++ pxor @XMM[8+6], @XMM[6]
++ lea 0x80(%rsp), %rax # pass key schedule
++ pxor @XMM[8+7], @XMM[7]
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8 # results are consumed from registers 0,1,4,6,3,7,2,5
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[6]
++ movdqu @XMM[4], 0x20($out)
++ pxor 0x40(%rsp), @XMM[3]
++ movdqu @XMM[6], 0x30($out)
++ pxor 0x50(%rsp), @XMM[7]
++ movdqu @XMM[3], 0x40($out)
++ pxor 0x60(%rsp), @XMM[2]
++ movdqu @XMM[7], 0x50($out)
++ pxor 0x70(%rsp), @XMM[5]
++ movdqu @XMM[2], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out
++
++ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
++ pxor $twtmp, $twtmp
++ movdqa .Lxts_magic(%rip), $twmask
++ pcmpgtd @XMM[7], $twtmp
++ pshufd \$0x13, $twtmp, $twres
++ pxor $twtmp, $twtmp
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++ pxor $twres, @XMM[7]
++
++ sub \$0x80,$len
++ jnc .Lxts_enc_loop # at least 0x80 more bytes
++
++.Lxts_enc_short:
++ add \$0x80, $len # undo the subtraction: 0..0x70 bytes of whole blocks remain
++ jz .Lxts_enc_done
++___
++ for ($i=0;$i<7;$i++) { # same tweak generation, with an exit once the remaining length is reached
++ $code.=<<___;
++ pshufd \$0x13, $twtmp, $twres
++ pxor $twtmp, $twtmp
++ movdqa @XMM[7], @XMM[$i]
++ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++ pxor $twres, @XMM[7]
++___
++ $code.=<<___ if ($i>=1);
++ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
++ cmp \$`0x10*$i`,$len
++ je .Lxts_enc_$i
++___
++ $code.=<<___ if ($i>=2);
++ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
++___
++ }
++$code.=<<___;
++ movdqu 0x60($inp), @XMM[8+6]
++ pxor @XMM[8+5], @XMM[5]
++ movdqa @XMM[7], 0x70(%rsp)
++ lea 0x70($inp), $inp
++ pxor @XMM[8+6], @XMM[6]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[6]
++ movdqu @XMM[4], 0x20($out)
++ pxor 0x40(%rsp), @XMM[3]
++ movdqu @XMM[6], 0x30($out)
++ pxor 0x50(%rsp), @XMM[7]
++ movdqu @XMM[3], 0x40($out)
++ pxor 0x60(%rsp), @XMM[2]
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[2], 0x60($out)
++ lea 0x70($out), $out
++
++ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_enc_done
++.align 16
++.Lxts_enc_6:
++ pxor @XMM[8+4], @XMM[4] # finish the input xors the generated loop had not emitted yet
++ lea 0x60($inp), $inp
++ pxor @XMM[8+5], @XMM[5]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[6]
++ movdqu @XMM[4], 0x20($out)
++ pxor 0x40(%rsp), @XMM[3]
++ movdqu @XMM[6], 0x30($out)
++ pxor 0x50(%rsp), @XMM[7]
++ movdqu @XMM[3], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ lea 0x60($out), $out
++
++ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_enc_done
++.align 16
++.Lxts_enc_5:
++ pxor @XMM[8+3], @XMM[3]
++ lea 0x50($inp), $inp
++ pxor @XMM[8+4], @XMM[4]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[6]
++ movdqu @XMM[4], 0x20($out)
++ pxor 0x40(%rsp), @XMM[3]
++ movdqu @XMM[6], 0x30($out)
++ movdqu @XMM[3], 0x40($out)
++ lea 0x50($out), $out
++
++ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_enc_done
++.align 16
++.Lxts_enc_4:
++ pxor @XMM[8+2], @XMM[2]
++ lea 0x40($inp), $inp
++ pxor @XMM[8+3], @XMM[3]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[6]
++ movdqu @XMM[4], 0x20($out)
++ movdqu @XMM[6], 0x30($out)
++ lea 0x40($out), $out
++
++ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_enc_done
++.align 16
++.Lxts_enc_3:
++ pxor @XMM[8+1], @XMM[1]
++ lea 0x30($inp), $inp
++ pxor @XMM[8+2], @XMM[2]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[4]
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[4], 0x20($out)
++ lea 0x30($out), $out
++
++ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_enc_done
++.align 16
++.Lxts_enc_2:
++ pxor @XMM[8+0], @XMM[0]
++ lea 0x20($inp), $inp
++ pxor @XMM[8+1], @XMM[1]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_encrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ lea 0x20($out), $out
++
++ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_enc_done
++.align 16
++.Lxts_enc_1: # single block: goes through asm_AES_encrypt via the stack buffer instead of the 8-way core
++ pxor @XMM[0], @XMM[8]
++ lea 0x10($inp), $inp
++ movdqa @XMM[8], 0x20(%rbp)
++ lea 0x20(%rbp), $arg1
++ lea 0x20(%rbp), $arg2
++ lea ($key), $arg3
++ call asm_AES_encrypt # doesn't touch %xmm
++ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
++ #pxor @XMM[8], @XMM[0]
++ #lea 0x80(%rsp), %rax # pass key schedule
++ #mov %edx, %r10d # pass rounds
++ #call _bsaes_encrypt8
++ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ movdqu @XMM[0], 0x00($out) # write output
++ lea 0x10($out), $out
++
++ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
++
++.Lxts_enc_done:
++ and \$15, %ebx # leftover bytes beyond the last whole block
++ jz .Lxts_enc_ret
++ mov $out, %rdx
++
++.Lxts_enc_steal: # per byte: move the previous output byte forward by 16 and put the input byte in its place
++ movzb ($inp), %eax
++ movzb -16(%rdx), %ecx
++ lea 1($inp), $inp
++ mov %al, -16(%rdx)
++ mov %cl, 0(%rdx)
++ lea 1(%rdx), %rdx
++ sub \$1,%ebx
++ jnz .Lxts_enc_steal
++
++ movdqu -16($out), @XMM[0] # re-encrypt the patched block under the next tweak
++ lea 0x20(%rbp), $arg1
++ pxor @XMM[7], @XMM[0]
++ lea 0x20(%rbp), $arg2
++ movdqa @XMM[0], 0x20(%rbp)
++ lea ($key), $arg3
++ call asm_AES_encrypt # doesn't touch %xmm
++ pxor 0x20(%rbp), @XMM[7]
++ movdqu @XMM[7], -16($out)
++
++.Lxts_enc_ret:
++ lea (%rsp), %rax
++ pxor %xmm0, %xmm0
++.Lxts_enc_bzero: # wipe key schedule [if any]
++ movdqa %xmm0, 0x00(%rax)
++ movdqa %xmm0, 0x10(%rax)
++ lea 0x20(%rax), %rax
++ cmp %rax, %rbp
++ ja .Lxts_enc_bzero # unsigned: keep wiping while %rax is below %rbp
++
++ lea 0x78(%rbp),%rax # 0x48 scratch plus six 8-byte pushes
++.cfi_def_cfa %rax,8
++___
++$code.=<<___ if ($win64);
++ movaps 0x40(%rbp), %xmm6
++ movaps 0x50(%rbp), %xmm7
++ movaps 0x60(%rbp), %xmm8
++ movaps 0x70(%rbp), %xmm9
++ movaps 0x80(%rbp), %xmm10
++ movaps 0x90(%rbp), %xmm11
++ movaps 0xa0(%rbp), %xmm12
++ movaps 0xb0(%rbp), %xmm13
++ movaps 0xc0(%rbp), %xmm14
++ movaps 0xd0(%rbp), %xmm15
++ lea 0xa0(%rax), %rax # step over the xmm save area
++.Lxts_enc_tail:
++___
++$code.=<<___;
++ mov -48(%rax), %r15
++.cfi_restore %r15
++ mov -40(%rax), %r14
++.cfi_restore %r14
++ mov -32(%rax), %r13
++.cfi_restore %r13
++ mov -24(%rax), %r12
++.cfi_restore %r12
++ mov -16(%rax), %rbx
++.cfi_restore %rbx
++ mov -8(%rax), %rbp
++.cfi_restore %rbp
++ lea (%rax), %rsp # restore %rsp
++.cfi_def_cfa_register %rsp
++.Lxts_enc_epilogue:
++ ret
++.cfi_endproc
++.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
++
++.globl bsaes_xts_decrypt
++.type bsaes_xts_decrypt,\@abi-omnipotent
++.align 16
++bsaes_xts_decrypt: # XTS decrypt; args in order: inp, out, len, key, key2 (tweak key), ivp
++.cfi_startproc
++ mov %rsp, %rax # entry stack pointer; se_handler takes Rax as Rsp while in the prologue
++.Lxts_dec_prologue:
++ push %rbp
++.cfi_push %rbp
++ push %rbx
++.cfi_push %rbx
++ push %r12
++.cfi_push %r12
++ push %r13
++.cfi_push %r13
++ push %r14
++.cfi_push %r14
++ push %r15
++.cfi_push %r15
++ lea -0x48(%rsp), %rsp # 0x48 bytes of local frame
++.cfi_adjust_cfa_offset 0x48
++___
++$code.=<<___ if ($win64);
++ mov 0xa0(%rsp),$arg5 # pull key2
++ mov 0xa8(%rsp),$arg6 # pull ivp
++ lea -0xa0(%rsp), %rsp # room for xmm6-xmm15 (10*16 bytes)
++ movaps %xmm6, 0x40(%rsp) # save xmm6-xmm15; restored before return
++ movaps %xmm7, 0x50(%rsp)
++ movaps %xmm8, 0x60(%rsp)
++ movaps %xmm9, 0x70(%rsp)
++ movaps %xmm10, 0x80(%rsp)
++ movaps %xmm11, 0x90(%rsp)
++ movaps %xmm12, 0xa0(%rsp)
++ movaps %xmm13, 0xb0(%rsp)
++ movaps %xmm14, 0xc0(%rsp)
++ movaps %xmm15, 0xd0(%rsp)
++.Lxts_dec_body: # referenced as HandlerData[0] in .Lxts_dec_info
++___
++$code.=<<___;
++ mov %rsp, %rbp # backup %rsp
++ mov $arg1, $inp # backup arguments
++ mov $arg2, $out
++ mov $arg3, $len
++ mov $arg4, $key
++
++ lea ($arg6), $arg1 # in: ivp
++ lea 0x20(%rbp), $arg2 # out: scratch block at 0x20(%rbp)
++ lea ($arg5), $arg3 # key: key2
++ call asm_AES_encrypt # generate initial tweak
++
++ mov 240($key), %eax # rounds
++ mov $len, %rbx # backup $len
++
++ mov %eax, %edx # rounds
++ shl \$7, %rax # 128 bytes per inner round key
++ sub \$`128-32`, %rax # size of bit-sliced key schedule
++ sub %rax, %rsp # carve the schedule out of the stack
++
++ mov %rsp, %rax # pass key schedule
++ mov $key, %rcx # pass key
++ mov %edx, %r10d # pass rounds
++ call _bsaes_key_convert
++ pxor (%rsp), %xmm7 # fix up round 0 key
++ movdqa %xmm6, (%rax) # save last round key
++ movdqa %xmm7, (%rsp)
++
++ xor %eax, %eax # if ($len%16) len-=16;
++ and \$-16, $len # round len down to whole blocks
++ test \$15, %ebx # partial block in the original len?
++ setnz %al
++ shl \$4, %rax # 16 if there is a partial block, else 0
++ sub %rax, $len
++
++ sub \$0x80, %rsp # place for tweak[8]
++ movdqa 0x20(%rbp), @XMM[7] # initial tweak
++
++ pxor $twtmp, $twtmp
++ movdqa .Lxts_magic(%rip), $twmask
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++
++ sub \$0x80, $len # 8 whole blocks available?
++ jc .Lxts_dec_short # no: fewer than 8 blocks left
++ jmp .Lxts_dec_loop
++
++.align 16
++.Lxts_dec_loop: # main loop: 8 blocks per iteration
++___
++ for ($i=0;$i<7;$i++) { # emit code for tweak[0..6]
++ $code.=<<___;
++ pshufd \$0x13, $twtmp, $twres
++ pxor $twtmp, $twtmp
++ movdqa @XMM[7], @XMM[$i]
++ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++ pxor $twres, @XMM[7]
++___
++ $code.=<<___ if ($i>=1); # input load trails the tweak by one step
++ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
++___
++ $code.=<<___ if ($i>=2);
++ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
++___
++ }
++$code.=<<___;
++ movdqu 0x60($inp), @XMM[8+6] # load input block 6
++ pxor @XMM[8+5], @XMM[5]
++ movdqu 0x70($inp), @XMM[8+7] # load input block 7
++ lea 0x80($inp), $inp # advance input by 8 blocks
++ movdqa @XMM[7], 0x70(%rsp) # save tweak[7]
++ pxor @XMM[8+6], @XMM[6]
++ lea 0x80(%rsp), %rax # pass key schedule
++ pxor @XMM[8+7], @XMM[7]
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[6] # registers are stored in the order 0,1,6,4,2,7,3,5
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[4]
++ movdqu @XMM[6], 0x20($out)
++ pxor 0x40(%rsp), @XMM[2]
++ movdqu @XMM[4], 0x30($out)
++ pxor 0x50(%rsp), @XMM[7]
++ movdqu @XMM[2], 0x40($out)
++ pxor 0x60(%rsp), @XMM[3]
++ movdqu @XMM[7], 0x50($out)
++ pxor 0x70(%rsp), @XMM[5]
++ movdqu @XMM[3], 0x60($out)
++ movdqu @XMM[5], 0x70($out)
++ lea 0x80($out), $out # advance output by 8 blocks
++
++ movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
++ pxor $twtmp, $twtmp
++ movdqa .Lxts_magic(%rip), $twmask
++ pcmpgtd @XMM[7], $twtmp
++ pshufd \$0x13, $twtmp, $twres
++ pxor $twtmp, $twtmp
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++ pxor $twres, @XMM[7]
++
++ sub \$0x80,$len
++ jnc .Lxts_dec_loop # loop while 8 more whole blocks remain
++
++.Lxts_dec_short: # 0..7 whole blocks remain
++ add \$0x80, $len # undo the 0x80 bias
++ jz .Lxts_dec_done # no whole blocks left
++___
++ for ($i=0;$i<7;$i++) { # same tweak generation, with an early exit per block count
++ $code.=<<___;
++ pshufd \$0x13, $twtmp, $twres
++ pxor $twtmp, $twtmp
++ movdqa @XMM[7], @XMM[$i]
++ movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ pcmpgtd @XMM[7], $twtmp # broadcast upper bits
++ pxor $twres, @XMM[7]
++___
++ $code.=<<___ if ($i>=1);
++ movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
++ cmp \$`0x10*$i`,$len
++ je .Lxts_dec_$i # exactly that many blocks left
++___
++ $code.=<<___ if ($i>=2);
++ pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
++___
++ }
++$code.=<<___;
++ movdqu 0x60($inp), @XMM[8+6] # fall through: 7 blocks remain
++ pxor @XMM[8+5], @XMM[5]
++ movdqa @XMM[7], 0x70(%rsp) # save the following tweak for the tail
++ lea 0x70($inp), $inp # advance input by 7 blocks
++ pxor @XMM[8+6], @XMM[6]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[6]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[4]
++ movdqu @XMM[6], 0x20($out)
++ pxor 0x40(%rsp), @XMM[2]
++ movdqu @XMM[4], 0x30($out)
++ pxor 0x50(%rsp), @XMM[7]
++ movdqu @XMM[2], 0x40($out)
++ pxor 0x60(%rsp), @XMM[3]
++ movdqu @XMM[7], 0x50($out)
++ movdqu @XMM[3], 0x60($out)
++ lea 0x70($out), $out # advance output by 7 blocks
++
++ movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_dec_done
++.align 16
++.Lxts_dec_6: # 6 blocks remain
++ pxor @XMM[8+4], @XMM[4]
++ lea 0x60($inp), $inp
++ pxor @XMM[8+5], @XMM[5]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[6]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[4]
++ movdqu @XMM[6], 0x20($out)
++ pxor 0x40(%rsp), @XMM[2]
++ movdqu @XMM[4], 0x30($out)
++ pxor 0x50(%rsp), @XMM[7]
++ movdqu @XMM[2], 0x40($out)
++ movdqu @XMM[7], 0x50($out)
++ lea 0x60($out), $out
++
++ movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_dec_done
++.align 16
++.Lxts_dec_5: # 5 blocks remain
++ pxor @XMM[8+3], @XMM[3]
++ lea 0x50($inp), $inp
++ pxor @XMM[8+4], @XMM[4]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[6]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[4]
++ movdqu @XMM[6], 0x20($out)
++ pxor 0x40(%rsp), @XMM[2]
++ movdqu @XMM[4], 0x30($out)
++ movdqu @XMM[2], 0x40($out)
++ lea 0x50($out), $out
++
++ movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_dec_done
++.align 16
++.Lxts_dec_4: # 4 blocks remain
++ pxor @XMM[8+2], @XMM[2]
++ lea 0x40($inp), $inp
++ pxor @XMM[8+3], @XMM[3]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[6]
++ movdqu @XMM[1], 0x10($out)
++ pxor 0x30(%rsp), @XMM[4]
++ movdqu @XMM[6], 0x20($out)
++ movdqu @XMM[4], 0x30($out)
++ lea 0x40($out), $out
++
++ movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_dec_done
++.align 16
++.Lxts_dec_3: # 3 blocks remain
++ pxor @XMM[8+1], @XMM[1]
++ lea 0x30($inp), $inp
++ pxor @XMM[8+2], @XMM[2]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ pxor 0x20(%rsp), @XMM[6]
++ movdqu @XMM[1], 0x10($out)
++ movdqu @XMM[6], 0x20($out)
++ lea 0x30($out), $out
++
++ movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_dec_done
++.align 16
++.Lxts_dec_2: # 2 blocks remain
++ pxor @XMM[8+0], @XMM[0]
++ lea 0x20($inp), $inp
++ pxor @XMM[8+1], @XMM[1]
++ lea 0x80(%rsp), %rax # pass key schedule
++ mov %edx, %r10d # pass rounds
++
++ call _bsaes_decrypt8
++
++ pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ pxor 0x10(%rsp), @XMM[1]
++ movdqu @XMM[0], 0x00($out) # write output
++ movdqu @XMM[1], 0x10($out)
++ lea 0x20($out), $out
++
++ movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
++ jmp .Lxts_dec_done
++.align 16
++.Lxts_dec_1: # 1 block remains: use the single-block routine
++ pxor @XMM[0], @XMM[8] # input ^ tweak
++ lea 0x10($inp), $inp
++ movdqa @XMM[8], 0x20(%rbp) # stage the block in the scratch slot
++ lea 0x20(%rbp), $arg1
++ lea 0x20(%rbp), $arg2
++ lea ($key), $arg3
++ call asm_AES_decrypt # doesn't touch %xmm
++ pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
++ #pxor @XMM[8], @XMM[0]
++ #lea 0x80(%rsp), %rax # pass key schedule
++ #mov %edx, %r10d # pass rounds
++ #call _bsaes_decrypt8
++ #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
++ movdqu @XMM[0], 0x00($out) # write output
++ lea 0x10($out), $out
++
++ movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
++
++.Lxts_dec_done:
++ and \$15, %ebx # leftover byte count (original len mod 16)
++ jz .Lxts_dec_ret # none: no partial block to handle
++
++ pxor $twtmp, $twtmp
++ movdqa .Lxts_magic(%rip), $twmask
++ pcmpgtd @XMM[7], $twtmp
++ pshufd \$0x13, $twtmp, $twres
++ movdqa @XMM[7], @XMM[6] # keep the current tweak for the final block
++ paddq @XMM[7], @XMM[7] # psllq 1,$tweak
++ pand $twmask, $twres # isolate carry and residue
++ movdqu ($inp), @XMM[0] # last whole input block
++ pxor $twres, @XMM[7]
++
++ lea 0x20(%rbp), $arg1
++ pxor @XMM[7], @XMM[0] # xor with the advanced tweak
++ lea 0x20(%rbp), $arg2
++ movdqa @XMM[0], 0x20(%rbp)
++ lea ($key), $arg3
++ call asm_AES_decrypt # doesn't touch %xmm
++ pxor 0x20(%rbp), @XMM[7] # decrypted block ^ advanced tweak
++ mov $out, %rdx
++ movdqu @XMM[7], ($out)
++
++.Lxts_dec_steal: # per leftover byte: swap with the trailing partial input
++ movzb 16($inp), %eax # byte of the trailing partial input
++ movzb (%rdx), %ecx # byte of the block just written
++ lea 1($inp), $inp
++ mov %al, (%rdx)
++ mov %cl, 16(%rdx) # becomes a byte of the partial output
++ lea 1(%rdx), %rdx
++ sub \$1,%ebx
++ jnz .Lxts_dec_steal
++
++ movdqu ($out), @XMM[0] # reassembled block
++ lea 0x20(%rbp), $arg1
++ pxor @XMM[6], @XMM[0] # xor with the tweak saved before advancing
++ lea 0x20(%rbp), $arg2
++ movdqa @XMM[0], 0x20(%rbp)
++ lea ($key), $arg3
++ call asm_AES_decrypt # doesn't touch %xmm
++ pxor 0x20(%rbp), @XMM[6]
++ movdqu @XMM[6], ($out)
++
++.Lxts_dec_ret:
++ lea (%rsp), %rax
++ pxor %xmm0, %xmm0
++.Lxts_dec_bzero: # wipe key schedule [if any]
++ movdqa %xmm0, 0x00(%rax)
++ movdqa %xmm0, 0x10(%rax)
++ lea 0x20(%rax), %rax
++ cmp %rax, %rbp
++ ja .Lxts_dec_bzero # zero 32 bytes at a time up to %rbp
++
++ lea 0x78(%rbp),%rax # skip 0x48 of locals plus the six pushed registers
++.cfi_def_cfa %rax,8
++___
++$code.=<<___ if ($win64);
++ movaps 0x40(%rbp), %xmm6
++ movaps 0x50(%rbp), %xmm7
++ movaps 0x60(%rbp), %xmm8
++ movaps 0x70(%rbp), %xmm9
++ movaps 0x80(%rbp), %xmm10
++ movaps 0x90(%rbp), %xmm11
++ movaps 0xa0(%rbp), %xmm12
++ movaps 0xb0(%rbp), %xmm13
++ movaps 0xc0(%rbp), %xmm14
++ movaps 0xd0(%rbp), %xmm15
++ lea 0xa0(%rax), %rax # also skip the xmm save area
++.Lxts_dec_tail: # referenced as HandlerData[2] in .Lxts_dec_info
++___
++$code.=<<___;
++ mov -48(%rax), %r15
++.cfi_restore %r15
++ mov -40(%rax), %r14
++.cfi_restore %r14
++ mov -32(%rax), %r13
++.cfi_restore %r13
++ mov -24(%rax), %r12
++.cfi_restore %r12
++ mov -16(%rax), %rbx
++.cfi_restore %rbx
++ mov -8(%rax), %rbp
++.cfi_restore %rbp
++ lea (%rax), %rsp # restore %rsp
++.cfi_def_cfa_register %rsp
++.Lxts_dec_epilogue:
++ ret
++.cfi_endproc
++.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
++___
++}
++$code.=<<___;
++.type _bsaes_const,\@object
++.align 64
++_bsaes_const: # read-only constant pool, 64-byte aligned
++.LM0ISR: # InvShiftRows constants
++ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
++.LISRM0:
++ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
++.LISR:
++ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
++.LBS0: # bit-slice constants
++ .quad 0x5555555555555555, 0x5555555555555555
++.LBS1:
++ .quad 0x3333333333333333, 0x3333333333333333
++.LBS2:
++ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
++.LSR: # shiftrows constants
++ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
++.LSRM0:
++ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
++.LM0SR:
++ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
++.LSWPUP: # byte-swap upper dword
++ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
++.LSWPUPM0SR:
++ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
++.LADD1: # counter increment constants
++ .quad 0x0000000000000000, 0x0000000100000000
++.LADD2:
++ .quad 0x0000000000000000, 0x0000000200000000
++.LADD3:
++ .quad 0x0000000000000000, 0x0000000300000000
++.LADD4:
++ .quad 0x0000000000000000, 0x0000000400000000
++.LADD5:
++ .quad 0x0000000000000000, 0x0000000500000000
++.LADD6:
++ .quad 0x0000000000000000, 0x0000000600000000
++.LADD7:
++ .quad 0x0000000000000000, 0x0000000700000000
++.LADD8:
++ .quad 0x0000000000000000, 0x0000000800000000
++.Lxts_magic: # XTS tweak update mask, used to isolate carry and residue
++ .long 0x87,0,1,0
++.Lmasks: # per-byte single-bit masks 0x01, 0x02, 0x04, 0x08
++ .quad 0x0101010101010101, 0x0101010101010101
++ .quad 0x0202020202020202, 0x0202020202020202
++ .quad 0x0404040404040404, 0x0404040404040404
++ .quad 0x0808080808080808, 0x0808080808080808
++.LM0:
++ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
++.L63: # 0x63 replicated into every byte
++ .quad 0x6363636363636363, 0x6363636363636363
++.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
++.align 64
++.size _bsaes_const,.-_bsaes_const
++___
++
++# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
++# CONTEXT *context,DISPATCHER_CONTEXT *disp)
++if ($win64) {
++$rec="%rcx";
++$frame="%rdx";
++$context="%r8";
++$disp="%r9";
++
++$code.=<<___;
++.extern __imp_RtlVirtualUnwind
++.type se_handler,\@abi-omnipotent
++.align 16
++se_handler: # Win64 unwind handler shared by every routine listed in .pdata below
++ push %rsi
++ push %rdi
++ push %rbx
++ push %rbp
++ push %r12
++ push %r13
++ push %r14
++ push %r15
++ pushfq
++ sub \$64,%rsp # home space plus stack arguments for RtlVirtualUnwind
++
++ mov 120($context),%rax # pull context->Rax
++ mov 248($context),%rbx # pull context->Rip
++
++ mov 8($disp),%rsi # disp->ImageBase
++ mov 56($disp),%r11 # disp->HandlerData
++
++ mov 0(%r11),%r10d # HandlerData[0]
++ lea (%rsi,%r10),%r10 # body label, i.e. end of prologue
++ cmp %r10,%rbx # context->Rip<=prologue label
++ jbe .Lin_prologue # frame not set up yet: Rax still holds entry Rsp
++
++ mov 4(%r11),%r10d # HandlerData[1]
++ lea (%rsi,%r10),%r10 # epilogue label
++ cmp %r10,%rbx # context->Rip>=epilogue label
++ jae .Lin_prologue # registers and Rsp already restored, Rax equals Rsp
++
++ mov 8(%r11),%r10d # HandlerData[2]
++ lea (%rsi,%r10),%r10 # tail label
++ cmp %r10,%rbx # context->Rip>=tail label
++ jae .Lin_tail # xmm registers restored, Rax already points past the frame
++
++ mov 160($context),%rax # pull context->Rbp
++
++ lea 0x40(%rax),%rsi # %xmm save area
++ lea 512($context),%rdi # &context.Xmm6
++ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
++ .long 0xa548f3fc # cld; rep movsq
++ lea 0xa0+0x78(%rax),%rax # adjust stack pointer
++
++.Lin_tail: # Rax is the entry Rsp here; slots follow the prologue push order
++ mov -8(%rax),%rbp # pushed first, matches the function epilogues
++ mov -16(%rax),%rbx
++ mov -24(%rax),%r12
++ mov -32(%rax),%r13
++ mov -40(%rax),%r14
++ mov -48(%rax),%r15 # pushed last
++ mov %rbx,144($context) # restore context->Rbx
++ mov %rbp,160($context) # restore context->Rbp
++ mov %r12,216($context) # restore context->R12
++ mov %r13,224($context) # restore context->R13
++ mov %r14,232($context) # restore context->R14
++ mov %r15,240($context) # restore context->R15
++
++.Lin_prologue:
++ mov %rax,152($context) # restore context->Rsp
++
++ mov 40($disp),%rdi # disp->ContextRecord
++ mov $context,%rsi # context
++ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
++ .long 0xa548f3fc # cld; rep movsq
++
++ mov $disp,%rsi
++ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
++ mov 8(%rsi),%rdx # arg2, disp->ImageBase
++ mov 0(%rsi),%r8 # arg3, disp->ControlPc
++ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
++ mov 40(%rsi),%r10 # disp->ContextRecord
++ lea 56(%rsi),%r11 # &disp->HandlerData
++ lea 24(%rsi),%r12 # &disp->EstablisherFrame
++ mov %r10,32(%rsp) # arg5
++ mov %r11,40(%rsp) # arg6
++ mov %r12,48(%rsp) # arg7
++ mov %rcx,56(%rsp) # arg8, (NULL)
++ call *__imp_RtlVirtualUnwind(%rip)
++
++ mov \$1,%eax # ExceptionContinueSearch
++ add \$64,%rsp
++ popfq
++ pop %r15
++ pop %r14
++ pop %r13
++ pop %r12
++ pop %rbp
++ pop %rbx
++ pop %rdi
++ pop %rsi
++ ret
++.size se_handler,.-se_handler
++
++.section .pdata
++.align 4
++___
++$code.=<<___ if ($ecb);
++ .rva .Lecb_enc_prologue
++ .rva .Lecb_enc_epilogue
++ .rva .Lecb_enc_info
++
++ .rva .Lecb_dec_prologue
++ .rva .Lecb_dec_epilogue
++ .rva .Lecb_dec_info
++___
++$code.=<<___;
++ .rva .Lcbc_dec_prologue
++ .rva .Lcbc_dec_epilogue
++ .rva .Lcbc_dec_info
++
++ .rva .Lctr_enc_prologue
++ .rva .Lctr_enc_epilogue
++ .rva .Lctr_enc_info
++
++ .rva .Lxts_enc_prologue
++ .rva .Lxts_enc_epilogue
++ .rva .Lxts_enc_info
++
++ .rva .Lxts_dec_prologue
++ .rva .Lxts_dec_epilogue
++ .rva .Lxts_dec_info
++
++.section .xdata
++.align 8
++___
++$code.=<<___ if ($ecb);
++.Lecb_enc_info:
++ .byte 9,0,0,0 # UNWIND_INFO: version 1, UNW_FLAG_EHANDLER, no unwind codes
++ .rva se_handler
++ .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
++ .rva .Lecb_enc_tail
++ .long 0
++.Lecb_dec_info:
++ .byte 9,0,0,0
++ .rva se_handler
++ .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
++ .rva .Lecb_dec_tail
++ .long 0
++___
++$code.=<<___;
++.Lcbc_dec_info:
++ .byte 9,0,0,0 # UNWIND_INFO: version 1, UNW_FLAG_EHANDLER, no unwind codes
++ .rva se_handler
++ .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
++ .rva .Lcbc_dec_tail
++ .long 0
++.Lctr_enc_info:
++ .byte 9,0,0,0
++ .rva se_handler
++ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
++ .rva .Lctr_enc_tail
++ .long 0
++.Lxts_enc_info:
++ .byte 9,0,0,0
++ .rva se_handler
++ .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
++ .rva .Lxts_enc_tail
++ .long 0
++.Lxts_dec_info:
++ .byte 9,0,0,0
++ .rva se_handler
++ .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
++ .rva .Lxts_dec_tail
++ .long 0
++___
++}
++
++$code =~ s/\`([^\`]*)\`/eval($1)/gem; # evaluate the backtick-quoted arithmetic embedded in the assembly text
++
++print $code;
++
++close STDOUT or die "error closing STDOUT: $!"; # surface write/translator failures instead of emitting a truncated file