Merge branch 'for-linus/i2c-2638' of git://git.fluff.org/bjdooks/linux

[pandora-kernel.git] / arch / x86 / crypto / aesni-intel_asm.S
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S

index ff16756..8fe2a49 100644 (file)
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
   *            Vinodh Gopal <vinodh.gopal@intel.com>
   *            Kahraman Akdemir
   *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ *    Author: Mathias Krause <minipli@googlemail.com>
+ *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
  #include <linux/linkage.h>
  #include <asm/inst.h>
  
+#ifdef __x86_64__
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+
+
  .text
  
+
+#define        STACK_OFFSET    8*3
+#define        HashKey         16*0    // store HashKey <<1 mod poly here
+#define        HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
+#define        HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
+#define        HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
+#define        HashKey_k       16*4    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey <<1 mod poly here
+                               //(for Karatsuba purposes)
+#define        HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey^2 <<1 mod poly here
+                               // (for Karatsuba purposes)
+#define        HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey^3 <<1 mod poly here
+                               // (for Karatsuba purposes)
+#define        HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey^4 <<1 mod poly here
+                               // (for Karatsuba purposes)
+#define        VARIABLE_OFFSET 16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+#endif
+
+
  #define STATE1 %xmm0
  #define STATE2 %xmm4
  #define STATE3 %xmm5
@@ -32,12 +100,16 @@
  #define IN     IN1
  #define KEY    %xmm2
  #define IV     %xmm3
+
  #define BSWAP_MASK %xmm10
  #define CTR    %xmm11
  #define INC    %xmm12
  
+#ifdef __x86_64__
+#define AREG   %rax
  #define KEYP   %rdi
  #define OUTP   %rsi
+#define UKEYP  OUTP
  #define INP    %rdx
  #define LEN    %rcx
  #define IVP    %r8
@@ -46,6 +118,1588 @@
  #define TKEYP  T1
  #define T2     %r11
  #define TCTR_LOW T2
+#else
+#define AREG   %eax
+#define KEYP   %edi
+#define OUTP   AREG
+#define UKEYP  OUTP
+#define INP    %edx
+#define LEN    %esi
+#define IVP    %ebp
+#define KLEN   %ebx
+#define T1     %ecx
+#define TKEYP  T1
+#endif
+
+
+#ifdef __x86_64__
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+       movdqa    \GH, \TMP1
+       pshufd    $78, \GH, \TMP2
+       pshufd    $78, \HK, \TMP3
+       pxor      \GH, \TMP2            # TMP2 = a1+a0
+       pxor      \HK, \TMP3            # TMP3 = b1+b0
+       PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
+       PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
+       PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+       pxor      \GH, \TMP2
+       pxor      \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3             # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2             # right shift TMP2 2 DWs
+       pxor      \TMP3, \GH
+       pxor      \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+       movdqa    \GH, \TMP2
+       movdqa    \GH, \TMP3
+       movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
+                                       # in in order to perform
+                                       # independent shifts
+       pslld     $31, \TMP2            # packed right shift <<31
+       pslld     $30, \TMP3            # packed right shift <<30
+       pslld     $25, \TMP4            # packed right shift <<25
+       pxor      \TMP3, \TMP2          # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5             # right shift TMP5 1 DW
+       pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+       pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+       movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
+                                       # in in order to perform
+                                       # independent shifts
+       movdqa    \GH,\TMP3
+       movdqa    \GH,\TMP4
+       psrld     $1,\TMP2              # packed left shift >>1
+       psrld     $2,\TMP3              # packed left shift >>2
+       psrld     $7,\TMP4              # packed left shift >>7
+       pxor      \TMP3,\TMP2           # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \GH
+       pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+       mov        arg7, %r10           # %r10 = AAD
+       mov        arg8, %r12           # %r12 = aadLen
+       mov        %r12, %r11
+       pxor       %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+       movd       (%r10), \TMP1
+       pslldq     $12, \TMP1
+       psrldq     $4, %xmm\i
+       pxor       \TMP1, %xmm\i
+       add        $4, %r10
+       sub        $4, %r12
+       jne        _get_AAD_loop\num_initial_blocks\operation
+       cmp        $16, %r11
+       je         _get_AAD_loop2_done\num_initial_blocks\operation
+       mov        $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+       psrldq     $4, %xmm\i
+       sub        $4, %r12
+       cmp        %r11, %r12
+       jne        _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+
+       xor        %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+       mov        %arg5, %rax                      # %rax = *Y0
+       movdqu     (%rax), \XMM0                    # XMM0 = Y0
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+       paddd      ONE(%rip), \XMM0                 # INCR Y0
+       movdqa     \XMM0, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+       pxor       16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+       movaps 0x10(%rdi), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+       movaps 0x20(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x30(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x40(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x50(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x60(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x70(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x80(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x90(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0xa0(%arg1), \TMP1
+       AESENCLAST \TMP1, %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+       movdqu     (%arg3 , %r11, 1), \TMP1
+       pxor       \TMP1, %xmm\index
+       movdqu     %xmm\index, (%arg2 , %r11, 1)
+       # write back plaintext/ciphertext for num_initial_blocks
+       add        $16, %r11
+
+       movdqa     \TMP1, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM         %xmm14, %xmm\index
+
+               # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+       GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+       cmp        $64, %r13
+       jl      _initial_blocks_done\num_initial_blocks\operation
+       # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM1
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM2
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM3
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM4
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
+
+       pxor       16*0(%arg1), \XMM1
+       pxor       16*0(%arg1), \XMM2
+       pxor       16*0(%arg1), \XMM3
+       pxor       16*0(%arg1), \XMM4
+       movdqa     \TMP3, \TMP5
+       pshufd     $78, \TMP3, \TMP1
+       pxor       \TMP3, \TMP1
+       movdqa     \TMP1, HashKey_k(%rsp)
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+       movdqa     \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+       movaps 0x10*\index(%arg1), \TMP1
+       AESENC     \TMP1, \XMM1
+       AESENC     \TMP1, \XMM2
+       AESENC     \TMP1, \XMM3
+       AESENC     \TMP1, \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_3(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+       movaps 0x10*\index(%arg1), \TMP1
+       AESENC     \TMP1, \XMM1
+       AESENC     \TMP1, \XMM2
+       AESENC     \TMP1, \XMM3
+       AESENC     \TMP1, \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_4(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_4_k(%rsp)
+       movaps 0xa0(%arg1), \TMP2
+       AESENCLAST \TMP2, \XMM1
+       AESENCLAST \TMP2, \XMM2
+       AESENCLAST \TMP2, \XMM3
+       AESENCLAST \TMP2, \XMM4
+       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM1
+       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM1
+       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM2
+       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM2
+       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM3
+       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM3
+       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM4
+       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM4
+       add        $64, %r11
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+       pxor       \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+       mov        arg7, %r10           # %r10 = AAD
+       mov        arg8, %r12           # %r12 = aadLen
+       mov        %r12, %r11
+       pxor       %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+       movd       (%r10), \TMP1
+       pslldq     $12, \TMP1
+       psrldq     $4, %xmm\i
+       pxor       \TMP1, %xmm\i
+       add        $4, %r10
+       sub        $4, %r12
+       jne        _get_AAD_loop\num_initial_blocks\operation
+       cmp        $16, %r11
+       je         _get_AAD_loop2_done\num_initial_blocks\operation
+       mov        $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+       psrldq     $4, %xmm\i
+       sub        $4, %r12
+       cmp        %r11, %r12
+       jne        _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
+
+       xor        %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+       mov        %arg5, %rax                      # %rax = *Y0
+       movdqu     (%rax), \XMM0                    # XMM0 = Y0
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+       paddd      ONE(%rip), \XMM0                 # INCR Y0
+       movdqa     \XMM0, %xmm\index
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+       pxor       16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+       movaps 0x10(%rdi), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+       movaps 0x20(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x30(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x40(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x50(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x60(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x70(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x80(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0x90(%arg1), \TMP1
+       AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       movaps 0xa0(%arg1), \TMP1
+       AESENCLAST \TMP1, %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+       movdqu     (%arg3 , %r11, 1), \TMP1
+       pxor       \TMP1, %xmm\index
+       movdqu     %xmm\index, (%arg2 , %r11, 1)
+       # write back plaintext/ciphertext for num_initial_blocks
+       add        $16, %r11
+
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM         %xmm14, %xmm\index
+
+               # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+       GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+       cmp        $64, %r13
+       jl      _initial_blocks_done\num_initial_blocks\operation
+       # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM1
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM2
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM3
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
+
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM4
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
+
+       pxor       16*0(%arg1), \XMM1
+       pxor       16*0(%arg1), \XMM2
+       pxor       16*0(%arg1), \XMM3
+       pxor       16*0(%arg1), \XMM4
+       movdqa     \TMP3, \TMP5
+       pshufd     $78, \TMP3, \TMP1
+       pxor       \TMP3, \TMP1
+       movdqa     \TMP1, HashKey_k(%rsp)
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+       movdqa     \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+       movaps 0x10*\index(%arg1), \TMP1
+       AESENC     \TMP1, \XMM1
+       AESENC     \TMP1, \XMM2
+       AESENC     \TMP1, \XMM3
+       AESENC     \TMP1, \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_3(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+       movaps 0x10*\index(%arg1), \TMP1
+       AESENC     \TMP1, \XMM1
+       AESENC     \TMP1, \XMM2
+       AESENC     \TMP1, \XMM3
+       AESENC     \TMP1, \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_4(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_4_k(%rsp)
+       movaps 0xa0(%arg1), \TMP2
+       AESENCLAST \TMP2, \XMM1
+       AESENCLAST \TMP2, \XMM2
+       AESENCLAST \TMP2, \XMM3
+       AESENCLAST \TMP2, \XMM4
+       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM1
+       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM2
+       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM3
+       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM4
+       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+
+       add        $64, %r11
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+       pxor       \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+        movdqa     SHUF_MASK(%rip), %xmm14
+       PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+       movdqa    \XMM1, \XMM5
+       movdqa    \XMM2, \XMM6
+       movdqa    \XMM3, \XMM7
+       movdqa    \XMM4, \XMM8
+
+        movdqa    SHUF_MASK(%rip), %xmm15
+        # multiply TMP5 * HashKey using karatsuba
+
+       movdqa    \XMM5, \TMP4
+       pshufd    $78, \XMM5, \TMP6
+       pxor      \XMM5, \TMP6
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    HashKey_4(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+       movdqa    \XMM0, \XMM1
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM2
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM3
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM4
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
+       pxor      (%arg1), \XMM1
+       pxor      (%arg1), \XMM2
+       pxor      (%arg1), \XMM3
+       pxor      (%arg1), \XMM4
+       movdqa    HashKey_4_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+       movaps 0x10(%arg1), \TMP1
+       AESENC    \TMP1, \XMM1              # Round 1
+       AESENC    \TMP1, \XMM2
+       AESENC    \TMP1, \XMM3
+       AESENC    \TMP1, \XMM4
+       movaps 0x20(%arg1), \TMP1
+       AESENC    \TMP1, \XMM1              # Round 2
+       AESENC    \TMP1, \XMM2
+       AESENC    \TMP1, \XMM3
+       AESENC    \TMP1, \XMM4
+       movdqa    \XMM6, \TMP1
+       pshufd    $78, \XMM6, \TMP2
+       pxor      \XMM6, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+       movaps 0x30(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 3
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+       movaps 0x40(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 4
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       movdqa    HashKey_3_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       movaps 0x50(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 5
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM6, \XMM5
+       pxor      \TMP2, \TMP6
+       movdqa    \XMM7, \TMP1
+       pshufd    $78, \XMM7, \TMP2
+       pxor      \XMM7, \TMP2
+       movdqa    HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+       PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+       movaps 0x60(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 6
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+       movaps 0x70(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1             # Round 7
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       movdqa    HashKey_2_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       movaps 0x80(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1             # Round 8
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM7, \XMM5
+       pxor      \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+       movdqa    \XMM8, \TMP1
+       pshufd    $78, \XMM8, \TMP2
+       pxor      \XMM8, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+       movaps 0x90(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1            # Round 9
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+       movaps 0xa0(%arg1), \TMP3
+       AESENCLAST \TMP3, \XMM1           # Round 10
+       AESENCLAST \TMP3, \XMM2
+       AESENCLAST \TMP3, \XMM3
+       AESENCLAST \TMP3, \XMM4
+       movdqa    HashKey_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+       movdqu    (%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+       movdqu    16(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+       movdqu    32(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+       movdqu    48(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
+       pxor      \TMP4, \TMP1
+       pxor      \XMM8, \XMM5
+       pxor      \TMP6, \TMP2
+       pxor      \TMP1, \TMP2
+       pxor      \XMM5, \TMP2
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
+       pxor      \TMP3, \XMM5
+       pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+       movdqa    \XMM5, \TMP2
+       movdqa    \XMM5, \TMP3
+       movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+       pslld     $31, \TMP2                   # packed right shift << 31
+       pslld     $30, \TMP3                   # packed right shift << 30
+       pslld     $25, \TMP4                   # packed right shift << 25
+       pxor      \TMP3, \TMP2                 # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5                    # right shift T5 1 DW
+       pslldq    $12, \TMP2                   # left shift T2 3 DWs
+       pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+       movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+       movdqa    \XMM5,\TMP3
+       movdqa    \XMM5,\TMP4
+       psrld     $1, \TMP2                    # packed left shift >>1
+       psrld     $2, \TMP3                    # packed left shift >>2
+       psrld     $7, \TMP4                    # packed left shift >>7
+       pxor      \TMP3,\TMP2                  # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \XMM5
+       pxor      \TMP1, \XMM5                 # result is in TMP1
+
+       pxor      \XMM5, \XMM1
+.endm
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+       movdqa    \XMM1, \XMM5
+       movdqa    \XMM2, \XMM6
+       movdqa    \XMM3, \XMM7
+       movdqa    \XMM4, \XMM8
+
+        movdqa    SHUF_MASK(%rip), %xmm15
+        # multiply TMP5 * HashKey using karatsuba
+
+       movdqa    \XMM5, \TMP4
+       pshufd    $78, \XMM5, \TMP6
+       pxor      \XMM5, \TMP6
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    HashKey_4(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+       movdqa    \XMM0, \XMM1
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM2
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM3
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM4
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
+       pxor      (%arg1), \XMM1
+       pxor      (%arg1), \XMM2
+       pxor      (%arg1), \XMM3
+       pxor      (%arg1), \XMM4
+       movdqa    HashKey_4_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+       movaps 0x10(%arg1), \TMP1
+       AESENC    \TMP1, \XMM1              # Round 1
+       AESENC    \TMP1, \XMM2
+       AESENC    \TMP1, \XMM3
+       AESENC    \TMP1, \XMM4
+       movaps 0x20(%arg1), \TMP1
+       AESENC    \TMP1, \XMM1              # Round 2
+       AESENC    \TMP1, \XMM2
+       AESENC    \TMP1, \XMM3
+       AESENC    \TMP1, \XMM4
+       movdqa    \XMM6, \TMP1
+       pshufd    $78, \XMM6, \TMP2
+       pxor      \XMM6, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+       movaps 0x30(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 3
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+       movaps 0x40(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 4
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       movdqa    HashKey_3_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       movaps 0x50(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 5
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM6, \XMM5
+       pxor      \TMP2, \TMP6
+       movdqa    \XMM7, \TMP1
+       pshufd    $78, \XMM7, \TMP2
+       pxor      \XMM7, \TMP2
+       movdqa    HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+       PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+       movaps 0x60(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1              # Round 6
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+       movaps 0x70(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1             # Round 7
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       movdqa    HashKey_2_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       movaps 0x80(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1             # Round 8
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM7, \XMM5
+       pxor      \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+       movdqa    \XMM8, \TMP1
+       pshufd    $78, \XMM8, \TMP2
+       pxor      \XMM8, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+       movaps 0x90(%arg1), \TMP3
+       AESENC    \TMP3, \XMM1            # Round 9
+       AESENC    \TMP3, \XMM2
+       AESENC    \TMP3, \XMM3
+       AESENC    \TMP3, \XMM4
+       PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+       movaps 0xa0(%arg1), \TMP3
+       AESENCLAST \TMP3, \XMM1           # Round 10
+       AESENCLAST \TMP3, \XMM2
+       AESENCLAST \TMP3, \XMM3
+       AESENCLAST \TMP3, \XMM4
+       movdqa    HashKey_k(%rsp), \TMP5
+       PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+       movdqu    (%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+       movdqu    \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+       movdqa    \TMP3, \XMM1
+       movdqu    16(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+       movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM2
+       movdqu    32(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+       movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM3
+       movdqu    48(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+       movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM4
+       PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
+       PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
+
+       pxor      \TMP4, \TMP1
+       pxor      \XMM8, \XMM5
+       pxor      \TMP6, \TMP2
+       pxor      \TMP1, \TMP2
+       pxor      \XMM5, \TMP2
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
+       pxor      \TMP3, \XMM5
+       pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+       movdqa    \XMM5, \TMP2
+       movdqa    \XMM5, \TMP3
+       movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+       pslld     $31, \TMP2                   # packed right shift << 31
+       pslld     $30, \TMP3                   # packed right shift << 30
+       pslld     $25, \TMP4                   # packed right shift << 25
+       pxor      \TMP3, \TMP2                 # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5                    # right shift T5 1 DW
+       pslldq    $12, \TMP2                   # left shift T2 3 DWs
+       pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+       movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+       movdqa    \XMM5,\TMP3
+       movdqa    \XMM5,\TMP4
+       psrld     $1, \TMP2                    # packed left shift >>1
+       psrld     $2, \TMP3                    # packed left shift >>2
+       psrld     $7, \TMP4                    # packed left shift >>7
+       pxor      \TMP3,\TMP2                  # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \XMM5
+       pxor      \TMP1, \XMM5                 # result is in TMP1
+
+       pxor      \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+       movdqa    \XMM1, \TMP6
+       pshufd    $78, \XMM1, \TMP2
+       pxor      \XMM1, \TMP2
+       movdqa    HashKey_4(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+       PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+       movdqa    HashKey_4_k(%rsp), \TMP4
+       PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       movdqa    \XMM1, \XMMDst
+       movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM2, \TMP1
+       pshufd    $78, \XMM2, \TMP2
+       pxor      \XMM2, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+       movdqa    HashKey_3_k(%rsp), \TMP4
+       PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM2, \XMMDst
+       pxor      \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM3, \TMP1
+       pshufd    $78, \XMM3, \TMP2
+       pxor      \XMM3, \TMP2
+       movdqa    HashKey_2(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+       movdqa    HashKey_2_k(%rsp), \TMP4
+       PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM3, \XMMDst
+       pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+       movdqa    \XMM4, \TMP1
+       pshufd    $78, \XMM4, \TMP2
+       pxor      \XMM4, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       PCLMULQDQ 0x11, \TMP5, \TMP1        # TMP1 = a1*b1
+       PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+       movdqa    HashKey_k(%rsp), \TMP4
+       PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM4, \XMMDst
+       pxor      \XMM1, \TMP2
+       pxor      \TMP6, \TMP2
+       pxor      \XMMDst, \TMP2
+       # middle section of the temp results combined as in karatsuba algorithm
+       movdqa    \TMP2, \TMP4
+       pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
+       psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
+       pxor      \TMP4, \XMMDst
+       pxor      \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+       # first phase of the reduction
+       movdqa    \XMMDst, \TMP2
+       movdqa    \XMMDst, \TMP3
+       movdqa    \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+       pslld     $31, \TMP2                # packed right shifting << 31
+       pslld     $30, \TMP3                # packed right shifting << 30
+       pslld     $25, \TMP4                # packed right shifting << 25
+       pxor      \TMP3, \TMP2              # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP7
+       psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+       pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+       pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+       movdqa    \XMMDst, \TMP2
+       # make 3 copies of XMMDst for doing 3 shift operations
+       movdqa    \XMMDst, \TMP3
+       movdqa    \XMMDst, \TMP4
+       psrld     $1, \TMP2                 # packed left shift >> 1
+       psrld     $2, \TMP3                 # packed left shift >> 2
+       psrld     $7, \TMP4                 # packed left shift >> 7
+       pxor      \TMP3, \TMP2              # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       pxor      \TMP7, \TMP2
+       pxor      \TMP2, \XMMDst
+       pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+       pxor    (%arg1), \XMM0
+        movaps 16(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 32(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 48(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 64(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 80(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 96(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 112(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 128(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 144(%arg1), \TMP1
+       AESENC  \TMP1, \XMM0
+        movaps 160(%arg1), \TMP1
+       AESENCLAST      \TMP1, \XMM0
+.endm
+
+
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,      // Ciphertext input
+*                   u64 plaintext_len, // Length of data in bytes for decryption.
+*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,     // Additional Authentication Data (AAD)
+*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
+*                                      // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+*                                      // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first
+*       set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                        AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+       push    %r12
+       push    %r13
+       push    %r14
+       mov     %rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+       sub     $VARIABLE_OFFSET, %rsp
+       and     $~63, %rsp                        # align rsp to 64 bytes
+       mov     %arg6, %r12
+       movdqu  (%r12), %xmm13                    # %xmm13 = HashKey
+        movdqa  SHUF_MASK(%rip), %xmm2
+       PSHUFB_XMM %xmm2, %xmm13
+
+
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
+
+       movdqa  %xmm13, %xmm2
+       psllq   $1, %xmm13
+       psrlq   $63, %xmm2
+       movdqa  %xmm2, %xmm1
+       pslldq  $8, %xmm2
+       psrldq  $8, %xmm1
+       por     %xmm2, %xmm13
+
+        # Reduction
+
+       pshufd  $0x24, %xmm1, %xmm2
+       pcmpeqd TWOONE(%rip), %xmm2
+       pand    POLY(%rip), %xmm2
+       pxor    %xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+       movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+       mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
+       and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+       mov %r13, %r12
+       and $(3<<4), %r12
+       jz _initial_num_blocks_is_0_decrypt
+       cmp $(2<<4), %r12
+       jb _initial_num_blocks_is_1_decrypt
+       je _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+       INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+       sub     $48, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+       INITIAL_BLOCKS_DEC      2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+       sub     $32, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+       INITIAL_BLOCKS_DEC      1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+       sub     $16, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+       INITIAL_BLOCKS_DEC      0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+       cmp     $0, %r13
+       je      _zero_cipher_left_decrypt
+       sub     $64, %r13
+       je      _four_cipher_left_decrypt
+_decrypt_by_4:
+       GHASH_4_ENCRYPT_4_PARALLEL_DEC  %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _decrypt_by_4
+_four_cipher_left_decrypt:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+       mov     %arg4, %r13
+       and     $15, %r13                               # %r13 = arg4 (mod 16)
+       je      _multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+       paddd ONE(%rip), %xmm0         # increment CNT to get Yn
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm0
+
+       ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
+       sub $16, %r11
+       add %r13, %r11
+       movdqu (%arg3,%r11,1), %xmm1   # recieve the last <16 byte block
+       lea SHIFT_MASK+16(%rip), %r12
+       sub %r13, %r12
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+# (%r13 is the number of bytes in plaintext mod 16)
+       movdqu (%r12), %xmm2           # get the appropriate shuffle mask
+       PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 butes
+
+       movdqa  %xmm1, %xmm2
+       pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
+       movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+       # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+       pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
+       pand    %xmm1, %xmm2
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10 ,%xmm2
+
+       pxor %xmm2, %xmm8
+       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+                 # GHASH computation for the last <16 byte block
+       sub %r13, %r11
+       add $16, %r11
+
+        # output %r13 bytes
+       MOVQ_R64_XMM    %xmm0, %rax
+       cmp     $8, %r13
+       jle     _less_than_8_bytes_left_decrypt
+       mov     %rax, (%arg2 , %r11, 1)
+       add     $8, %r11
+       psrldq  $8, %xmm0
+       MOVQ_R64_XMM    %xmm0, %rax
+       sub     $8, %r13
+_less_than_8_bytes_left_decrypt:
+       mov     %al,  (%arg2, %r11, 1)
+       add     $1, %r11
+       shr     $8, %rax
+       sub     $1, %r13
+       jne     _less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+       mov     arg8, %r12                # %r13 = aadLen (number of bytes)
+       shl     $3, %r12                  # convert into number of bits
+       movd    %r12d, %xmm15             # len(A) in %xmm15
+       shl     $3, %arg4                 # len(C) in bits (*128)
+       MOVQ_R64_XMM    %arg4, %xmm1
+       pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+                # final GHASH computation
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm8
+
+       mov     %arg5, %rax               # %rax = *Y0
+       movdqu  (%rax), %xmm0             # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
+       pxor    %xmm8, %xmm0
+_return_T_decrypt:
+       mov     arg9, %r10                # %r10 = authTag
+       mov     arg10, %r11               # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_decrypt
+       cmp     $12, %r11
+       je      _T_12_decrypt
+_T_8_decrypt:
+       MOVQ_R64_XMM    %xmm0, %rax
+       mov     %rax, (%r10)
+       jmp     _return_T_done_decrypt
+_T_12_decrypt:
+       MOVQ_R64_XMM    %xmm0, %rax
+       mov     %rax, (%r10)
+       psrldq  $8, %xmm0
+       movd    %xmm0, %eax
+       mov     %eax, 8(%r10)
+       jmp     _return_T_done_decrypt
+_T_16_decrypt:
+       movdqu  %xmm0, (%r10)
+_return_T_done_decrypt:
+       mov     %r14, %rsp
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       ret
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,       // Plaintext input
+*                    u64 plaintext_len,  // Length of data in bytes for encryption.
+*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,      // Additional Authentication Data (AAD)
+*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,       // Authenticated Tag output.
+*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+*                                        // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the
+*       first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                 AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                         AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+       push    %r12
+       push    %r13
+       push    %r14
+       mov     %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+       sub     $VARIABLE_OFFSET, %rsp
+       and     $~63, %rsp
+       mov     %arg6, %r12
+       movdqu  (%r12), %xmm13
+        movdqa  SHUF_MASK(%rip), %xmm2
+       PSHUFB_XMM %xmm2, %xmm13
+
+
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+       movdqa  %xmm13, %xmm2
+       psllq   $1, %xmm13
+       psrlq   $63, %xmm2
+       movdqa  %xmm2, %xmm1
+       pslldq  $8, %xmm2
+       psrldq  $8, %xmm1
+       por     %xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+       pshufd  $0x24, %xmm1, %xmm2
+       pcmpeqd TWOONE(%rip), %xmm2
+       pand    POLY(%rip), %xmm2
+       pxor    %xmm2, %xmm13
+       movdqa  %xmm13, HashKey(%rsp)
+       mov     %arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
+       and     $-16, %r13
+       mov     %r13, %r12
+
+        # Encrypt first few blocks
+
+       and     $(3<<4), %r12
+       jz      _initial_num_blocks_is_0_encrypt
+       cmp     $(2<<4), %r12
+       jb      _initial_num_blocks_is_1_encrypt
+       je      _initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+       INITIAL_BLOCKS_ENC      3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+       sub     $48, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+       INITIAL_BLOCKS_ENC      2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+       sub     $32, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+       INITIAL_BLOCKS_ENC      1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+       sub     $16, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+       INITIAL_BLOCKS_ENC      0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+       cmp     $0, %r13
+       je      _zero_cipher_left_encrypt
+       sub     $64, %r13
+       je      _four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+       GHASH_4_ENCRYPT_4_PARALLEL_ENC  %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+       mov     %arg4, %r13
+       and     $15, %r13                       # %r13 = arg4 (mod 16)
+       je      _multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+       paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm0
+
+       ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
+       sub $16, %r11
+       add %r13, %r11
+       movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
+       lea SHIFT_MASK+16(%rip), %r12
+       sub %r13, %r12
+       # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+       # (%r13 is the number of bytes in plaintext mod 16)
+       movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
+       PSHUFB_XMM      %xmm2, %xmm1            # shift right 16-r13 byte
+       pxor    %xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
+       movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
+       # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+       pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10,%xmm0
+
+       pxor    %xmm0, %xmm8
+       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+       # GHASH computation for the last <16 byte block
+       sub     %r13, %r11
+       add     $16, %r11
+       PSHUFB_XMM %xmm10, %xmm1
+
+       # shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+       MOVQ_R64_XMM %xmm0, %rax
+       cmp $8, %r13
+       jle _less_than_8_bytes_left_encrypt
+       mov %rax, (%arg2 , %r11, 1)
+       add $8, %r11
+       psrldq $8, %xmm0
+       MOVQ_R64_XMM %xmm0, %rax
+       sub $8, %r13
+_less_than_8_bytes_left_encrypt:
+       mov %al,  (%arg2, %r11, 1)
+       add $1, %r11
+       shr $8, %rax
+       sub $1, %r13
+       jne _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+       mov     arg8, %r12    # %r12 = addLen (number of bytes)
+       shl     $3, %r12
+       movd    %r12d, %xmm15       # len(A) in %xmm15
+       shl     $3, %arg4               # len(C) in bits (*128)
+       MOVQ_R64_XMM    %arg4, %xmm1
+       pslldq  $8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15       # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+       # final GHASH computation
+        movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
+
+       mov     %arg5, %rax                    # %rax  = *Y0
+       movdqu  (%rax), %xmm0                  # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm15         # Encrypt(K, Y0)
+       pxor    %xmm8, %xmm0
+_return_T_encrypt:
+       mov     arg9, %r10                     # %r10 = authTag
+       mov     arg10, %r11                    # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_encrypt
+       cmp     $12, %r11
+       je      _T_12_encrypt
+_T_8_encrypt:
+       MOVQ_R64_XMM    %xmm0, %rax
+       mov     %rax, (%r10)
+       jmp     _return_T_done_encrypt
+_T_12_encrypt:
+       MOVQ_R64_XMM    %xmm0, %rax
+       mov     %rax, (%r10)
+       psrldq  $8, %xmm0
+       movd    %xmm0, %eax
+       mov     %eax, 8(%r10)
+       jmp     _return_T_done_encrypt
+_T_16_encrypt:
+       movdqu  %xmm0, (%r10)
+_return_T_done_encrypt:
+       mov     %r14, %rsp
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       ret
+
+#endif
+
  
  _key_expansion_128:
  _key_expansion_256a:
@@ -55,10 +1709,11 @@ _key_expansion_256a:
         shufps $0b10001100, %xmm0, %xmm4
         pxor %xmm4, %xmm0
         pxor %xmm1, %xmm0
-       movaps %xmm0, (%rcx)
-       add $0x10, %rcx
+       movaps %xmm0, (TKEYP)
+       add $0x10, TKEYP
         ret
  
+.align 4
  _key_expansion_192a:
         pshufd $0b01010101, %xmm1, %xmm1
         shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@ _key_expansion_192a:
  
         movaps %xmm0, %xmm1
         shufps $0b01000100, %xmm0, %xmm6
-       movaps %xmm6, (%rcx)
+       movaps %xmm6, (TKEYP)
         shufps $0b01001110, %xmm2, %xmm1
-       movaps %xmm1, 16(%rcx)
-       add $0x20, %rcx
+       movaps %xmm1, 0x10(TKEYP)
+       add $0x20, TKEYP
         ret
  
+.align 4
  _key_expansion_192b:
         pshufd $0b01010101, %xmm1, %xmm1
         shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@ _key_expansion_192b:
         pxor %xmm3, %xmm2
         pxor %xmm5, %xmm2
  
-       movaps %xmm0, (%rcx)
-       add $0x10, %rcx
+       movaps %xmm0, (TKEYP)
+       add $0x10, TKEYP
         ret
  
+.align 4
  _key_expansion_256b:
         pshufd $0b10101010, %xmm1, %xmm1
         shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@ _key_expansion_256b:
         shufps $0b10001100, %xmm2, %xmm4
         pxor %xmm4, %xmm2
         pxor %xmm1, %xmm2
-       movaps %xmm2, (%rcx)
-       add $0x10, %rcx
+       movaps %xmm2, (TKEYP)
+       add $0x10, TKEYP
         ret
  
  /*
@@ -116,17 +1773,23 @@ _key_expansion_256b:
   *                   unsigned int key_len)
   */
  ENTRY(aesni_set_key)
-       movups (%rsi), %xmm0            # user key (first 16 bytes)
-       movaps %xmm0, (%rdi)
-       lea 0x10(%rdi), %rcx            # key addr
-       movl %edx, 480(%rdi)
+#ifndef __x86_64__
+       pushl KEYP
+       movl 8(%esp), KEYP              # ctx
+       movl 12(%esp), UKEYP            # in_key
+       movl 16(%esp), %edx             # key_len
+#endif
+       movups (UKEYP), %xmm0           # user key (first 16 bytes)
+       movaps %xmm0, (KEYP)
+       lea 0x10(KEYP), TKEYP           # key addr
+       movl %edx, 480(KEYP)
         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
         cmp $24, %dl
         jb .Lenc_key128
         je .Lenc_key192
-       movups 0x10(%rsi), %xmm2        # other user key
-       movaps %xmm2, (%rcx)
-       add $0x10, %rcx
+       movups 0x10(UKEYP), %xmm2       # other user key
+       movaps %xmm2, (TKEYP)
+       add $0x10, TKEYP
         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
         call _key_expansion_256a
         AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key)
         call _key_expansion_256a
         jmp .Ldec_key
  .Lenc_key192:
-       movq 0x10(%rsi), %xmm2          # other user key
+       movq 0x10(UKEYP), %xmm2         # other user key
         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
         call _key_expansion_192a
         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key)
         AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
         call _key_expansion_128
  .Ldec_key:
-       sub $0x10, %rcx
-       movaps (%rdi), %xmm0
-       movaps (%rcx), %xmm1
-       movaps %xmm0, 240(%rcx)
-       movaps %xmm1, 240(%rdi)
-       add $0x10, %rdi
-       lea 240-16(%rcx), %rsi
+       sub $0x10, TKEYP
+       movaps (KEYP), %xmm0
+       movaps (TKEYP), %xmm1
+       movaps %xmm0, 240(TKEYP)
+       movaps %xmm1, 240(KEYP)
+       add $0x10, KEYP
+       lea 240-16(TKEYP), UKEYP
  .align 4
  .Ldec_key_loop:
-       movaps (%rdi), %xmm0
+       movaps (KEYP), %xmm0
         AESIMC %xmm0 %xmm1
-       movaps %xmm1, (%rsi)
-       add $0x10, %rdi
-       sub $0x10, %rsi
-       cmp %rcx, %rdi
+       movaps %xmm1, (UKEYP)
+       add $0x10, KEYP
+       sub $0x10, UKEYP
+       cmp TKEYP, KEYP
         jb .Ldec_key_loop
-       xor %rax, %rax
+       xor AREG, AREG
+#ifndef __x86_64__
+       popl KEYP
+#endif
         ret
  
  /*
   * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
   */
  ENTRY(aesni_enc)
+#ifndef __x86_64__
+       pushl KEYP
+       pushl KLEN
+       movl 12(%esp), KEYP
+       movl 16(%esp), OUTP
+       movl 20(%esp), INP
+#endif
         movl 480(KEYP), KLEN            # key length
         movups (INP), STATE             # input
         call _aesni_enc1
         movups STATE, (OUTP)            # output
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+#endif
         ret
  
  /*
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc)
   *     KEY
   *     TKEYP (T1)
   */
+.align 4
  _aesni_enc1:
         movaps (KEYP), KEY              # key
         mov KEYP, TKEYP
@@ -298,6 +1976,7 @@ _aesni_enc1:
   *     KEY
   *     TKEYP (T1)
   */
+.align 4
  _aesni_enc4:
         movaps (KEYP), KEY              # key
         mov KEYP, TKEYP
@@ -391,11 +2070,22 @@ _aesni_enc4:
   * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
   */
  ENTRY(aesni_dec)
+#ifndef __x86_64__
+       pushl KEYP
+       pushl KLEN
+       movl 12(%esp), KEYP
+       movl 16(%esp), OUTP
+       movl 20(%esp), INP
+#endif
         mov 480(KEYP), KLEN             # key length
         add $240, KEYP
         movups (INP), STATE             # input
         call _aesni_dec1
         movups STATE, (OUTP)            #output
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+#endif
         ret
  
  /*
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec)
   *     KEY
   *     TKEYP (T1)
   */
+.align 4
  _aesni_dec1:
         movaps (KEYP), KEY              # key
         mov KEYP, TKEYP
@@ -472,6 +2163,7 @@ _aesni_dec1:
   *     KEY
   *     TKEYP (T1)
   */
+.align 4
  _aesni_dec4:
         movaps (KEYP), KEY              # key
         mov KEYP, TKEYP
@@ -566,6 +2258,15 @@ _aesni_dec4:
   *                   size_t len)
   */
  ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+       pushl LEN
+       pushl KEYP
+       pushl KLEN
+       movl 16(%esp), KEYP
+       movl 20(%esp), OUTP
+       movl 24(%esp), INP
+       movl 28(%esp), LEN
+#endif
         test LEN, LEN           # check length
         jz .Lecb_enc_ret
         mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc)
         cmp $16, LEN
         jge .Lecb_enc_loop1
  .Lecb_enc_ret:
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+       popl LEN
+#endif
         ret
  
  /*
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc)
   *                   size_t len);
   */
  ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+       pushl LEN
+       pushl KEYP
+       pushl KLEN
+       movl 16(%esp), KEYP
+       movl 20(%esp), OUTP
+       movl 24(%esp), INP
+       movl 28(%esp), LEN
+#endif
         test LEN, LEN
         jz .Lecb_dec_ret
         mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec)
         cmp $16, LEN
         jge .Lecb_dec_loop1
  .Lecb_dec_ret:
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+       popl LEN
+#endif
         ret
  
  /*
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec)
   *                   size_t len, u8 *iv)
   */
  ENTRY(aesni_cbc_enc)
+#ifndef __x86_64__
+       pushl IVP
+       pushl LEN
+       pushl KEYP
+       pushl KLEN
+       movl 20(%esp), KEYP
+       movl 24(%esp), OUTP
+       movl 28(%esp), INP
+       movl 32(%esp), LEN
+       movl 36(%esp), IVP
+#endif
         cmp $16, LEN
         jb .Lcbc_enc_ret
         mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc)
         jge .Lcbc_enc_loop
         movups STATE, (IVP)
  .Lcbc_enc_ret:
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+       popl LEN
+       popl IVP
+#endif
         ret
  
  /*
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc)
   *                   size_t len, u8 *iv)
   */
  ENTRY(aesni_cbc_dec)
+#ifndef __x86_64__
+       pushl IVP
+       pushl LEN
+       pushl KEYP
+       pushl KLEN
+       movl 20(%esp), KEYP
+       movl 24(%esp), OUTP
+       movl 28(%esp), INP
+       movl 32(%esp), LEN
+       movl 36(%esp), IVP
+#endif
         cmp $16, LEN
         jb .Lcbc_dec_just_ret
         mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec)
         movaps IN1, STATE1
         movups 0x10(INP), IN2
         movaps IN2, STATE2
+#ifdef __x86_64__
         movups 0x20(INP), IN3
         movaps IN3, STATE3
         movups 0x30(INP), IN4
         movaps IN4, STATE4
+#else
+       movups 0x20(INP), IN1
+       movaps IN1, STATE3
+       movups 0x30(INP), IN2
+       movaps IN2, STATE4
+#endif
         call _aesni_dec4
         pxor IV, STATE1
+#ifdef __x86_64__
         pxor IN1, STATE2
         pxor IN2, STATE3
         pxor IN3, STATE4
         movaps IN4, IV
+#else
+       pxor (INP), STATE2
+       pxor 0x10(INP), STATE3
+       pxor IN1, STATE4
+       movaps IN2, IV
+#endif
         movups STATE1, (OUTP)
         movups STATE2, 0x10(OUTP)
         movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec)
  .Lcbc_dec_ret:
         movups IV, (IVP)
  .Lcbc_dec_just_ret:
+#ifndef __x86_64__
+       popl KLEN
+       popl KEYP
+       popl LEN
+       popl IVP
+#endif
         ret
  
+#ifdef __x86_64__
  .align 16
  .Lbswap_mask:
         .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec)
   *     INC:    == 1, in little endian
   *     BSWAP_MASK == endian swapping mask
   */
+.align 4
  _aesni_inc_init:
         movaps .Lbswap_mask, BSWAP_MASK
         movaps IV, CTR
@@ -768,6 +2538,7 @@ _aesni_inc_init:
   *     CTR:    == output IV, in little endian
   *     TCTR_LOW: == lower qword of CTR
   */
+.align 4
  _aesni_inc:
         paddq INC, CTR
         add $1, TCTR_LOW
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc)
         movups IV, (IVP)
  .Lctr_enc_just_ret:
         ret
+#endif