[CRYPTO] salsa20: Add x86-64 assembly version
authorTan Swee Heng <thesweeheng@gmail.com>
Mon, 17 Dec 2007 16:04:40 +0000 (00:04 +0800)
committerHerbert Xu <herbert@gondor.apana.org.au>
Thu, 10 Jan 2008 21:16:57 +0000 (08:16 +1100)
This is the x86-64 version of the Salsa20 stream cipher algorithm. The
original assembly code came from
<http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>. It has been
reformatted for clarity.

Signed-off-by: Tan Swee Heng <thesweeheng@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/Makefile
arch/x86/crypto/salsa20-x86_64-asm_64.S [new file with mode: 0644]
arch/x86/crypto/salsa20_glue.c
crypto/Kconfig

index 25cc844..09200e1 100644 (file)
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
@@ -15,3 +16,4 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
+salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644 (file)
index 0000000..6214a9b
--- /dev/null
@@ -0,0 +1,920 @@
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+       mov     %rsp,%r11
+       and     $31,%r11
+       add     $256,%r11
+       sub     %r11,%rsp
+       # x = arg1
+       mov     %rdi,%r8
+       # m = arg2
+       mov     %rsi,%rsi
+       # out = arg3
+       mov     %rdx,%rdi
+       # bytes = arg4
+       mov     %rcx,%rdx
+       #               unsigned>? bytes - 0
+       cmp     $0,%rdx
+       # comment:fp stack unchanged by jump
+       # goto done if !unsigned>
+       jbe     ._done
+       # comment:fp stack unchanged by fallthrough
+# start:
+._start:
+       # r11_stack = r11
+       movq    %r11,0(%rsp)
+       # r12_stack = r12
+       movq    %r12,8(%rsp)
+       # r13_stack = r13
+       movq    %r13,16(%rsp)
+       # r14_stack = r14
+       movq    %r14,24(%rsp)
+       # r15_stack = r15
+       movq    %r15,32(%rsp)
+       # rbx_stack = rbx
+       movq    %rbx,40(%rsp)
+       # rbp_stack = rbp
+       movq    %rbp,48(%rsp)
+       # in0 = *(uint64 *) (x + 0)
+       movq    0(%r8),%rcx
+       # in2 = *(uint64 *) (x + 8)
+       movq    8(%r8),%r9
+       # in4 = *(uint64 *) (x + 16)
+       movq    16(%r8),%rax
+       # in6 = *(uint64 *) (x + 24)
+       movq    24(%r8),%r10
+       # in8 = *(uint64 *) (x + 32)
+       movq    32(%r8),%r11
+       # in10 = *(uint64 *) (x + 40)
+       movq    40(%r8),%r12
+       # in12 = *(uint64 *) (x + 48)
+       movq    48(%r8),%r13
+       # in14 = *(uint64 *) (x + 56)
+       movq    56(%r8),%r14
+       # j0 = in0
+       movq    %rcx,56(%rsp)
+       # j2 = in2
+       movq    %r9,64(%rsp)
+       # j4 = in4
+       movq    %rax,72(%rsp)
+       # j6 = in6
+       movq    %r10,80(%rsp)
+       # j8 = in8
+       movq    %r11,88(%rsp)
+       # j10 = in10
+       movq    %r12,96(%rsp)
+       # j12 = in12
+       movq    %r13,104(%rsp)
+       # j14 = in14
+       movq    %r14,112(%rsp)
+       # x_backup = x
+       movq    %r8,120(%rsp)
+# bytesatleast1:
+._bytesatleast1:
+       #                   unsigned<? bytes - 64
+       cmp     $64,%rdx
+       # comment:fp stack unchanged by jump
+       #   goto nocopy if !unsigned<
+       jae     ._nocopy
+       #     ctarget = out
+       movq    %rdi,128(%rsp)
+       #     out = &tmp
+       leaq    192(%rsp),%rdi
+       #     i = bytes
+       mov     %rdx,%rcx
+       #     while (i) { *out++ = *m++; --i }
+       rep     movsb
+       #     out = &tmp
+       leaq    192(%rsp),%rdi
+       #     m = &tmp
+       leaq    192(%rsp),%rsi
+       # comment:fp stack unchanged by fallthrough
+#   nocopy:
+._nocopy:
+       #   out_backup = out
+       movq    %rdi,136(%rsp)
+       #   m_backup = m
+       movq    %rsi,144(%rsp)
+       #   bytes_backup = bytes
+       movq    %rdx,152(%rsp)
+       #   x1 = j0
+       movq    56(%rsp),%rdi
+       #   x0 = x1
+       mov     %rdi,%rdx
+       #   (uint64) x1 >>= 32
+       shr     $32,%rdi
+       #               x3 = j2
+       movq    64(%rsp),%rsi
+       #               x2 = x3
+       mov     %rsi,%rcx
+       #               (uint64) x3 >>= 32
+       shr     $32,%rsi
+       #   x5 = j4
+       movq    72(%rsp),%r8
+       #   x4 = x5
+       mov     %r8,%r9
+       #   (uint64) x5 >>= 32
+       shr     $32,%r8
+       #   x5_stack = x5
+       movq    %r8,160(%rsp)
+       #               x7 = j6
+       movq    80(%rsp),%r8
+       #               x6 = x7
+       mov     %r8,%rax
+       #               (uint64) x7 >>= 32
+       shr     $32,%r8
+       #   x9 = j8
+       movq    88(%rsp),%r10
+       #   x8 = x9
+       mov     %r10,%r11
+       #   (uint64) x9 >>= 32
+       shr     $32,%r10
+       #               x11 = j10
+       movq    96(%rsp),%r12
+       #               x10 = x11
+       mov     %r12,%r13
+       #               x10_stack = x10
+       movq    %r13,168(%rsp)
+       #               (uint64) x11 >>= 32
+       shr     $32,%r12
+       #   x13 = j12
+       movq    104(%rsp),%r13
+       #   x12 = x13
+       mov     %r13,%r14
+       #   (uint64) x13 >>= 32
+       shr     $32,%r13
+       #               x15 = j14
+       movq    112(%rsp),%r15
+       #               x14 = x15
+       mov     %r15,%rbx
+       #               (uint64) x15 >>= 32
+       shr     $32,%r15
+       #               x15_stack = x15
+       movq    %r15,176(%rsp)
+       #   i = 20
+       mov     $20,%r15
+#   mainloop:
+._mainloop:
+       #   i_backup = i
+       movq    %r15,184(%rsp)
+       #               x5 = x5_stack
+       movq    160(%rsp),%r15
+       # a = x12 + x0
+       lea     (%r14,%rdx),%rbp
+       # (uint32) a <<<= 7
+       rol     $7,%ebp
+       # x4 ^= a
+       xor     %rbp,%r9
+       #               b = x1 + x5
+       lea     (%rdi,%r15),%rbp
+       #               (uint32) b <<<= 7
+       rol     $7,%ebp
+       #               x9 ^= b
+       xor     %rbp,%r10
+       # a = x0 + x4
+       lea     (%rdx,%r9),%rbp
+       # (uint32) a <<<= 9
+       rol     $9,%ebp
+       # x8 ^= a
+       xor     %rbp,%r11
+       #               b = x5 + x9
+       lea     (%r15,%r10),%rbp
+       #               (uint32) b <<<= 9
+       rol     $9,%ebp
+       #               x13 ^= b
+       xor     %rbp,%r13
+       # a = x4 + x8
+       lea     (%r9,%r11),%rbp
+       # (uint32) a <<<= 13
+       rol     $13,%ebp
+       # x12 ^= a
+       xor     %rbp,%r14
+       #               b = x9 + x13
+       lea     (%r10,%r13),%rbp
+       #               (uint32) b <<<= 13
+       rol     $13,%ebp
+       #               x1 ^= b
+       xor     %rbp,%rdi
+       # a = x8 + x12
+       lea     (%r11,%r14),%rbp
+       # (uint32) a <<<= 18
+       rol     $18,%ebp
+       # x0 ^= a
+       xor     %rbp,%rdx
+       #               b = x13 + x1
+       lea     (%r13,%rdi),%rbp
+       #               (uint32) b <<<= 18
+       rol     $18,%ebp
+       #               x5 ^= b
+       xor     %rbp,%r15
+       #                               x10 = x10_stack
+       movq    168(%rsp),%rbp
+       #               x5_stack = x5
+       movq    %r15,160(%rsp)
+       #                               c = x6 + x10
+       lea     (%rax,%rbp),%r15
+       #                               (uint32) c <<<= 7
+       rol     $7,%r15d
+       #                               x14 ^= c
+       xor     %r15,%rbx
+       #                               c = x10 + x14
+       lea     (%rbp,%rbx),%r15
+       #                               (uint32) c <<<= 9
+       rol     $9,%r15d
+       #                               x2 ^= c
+       xor     %r15,%rcx
+       #                               c = x14 + x2
+       lea     (%rbx,%rcx),%r15
+       #                               (uint32) c <<<= 13
+       rol     $13,%r15d
+       #                               x6 ^= c
+       xor     %r15,%rax
+       #                               c = x2 + x6
+       lea     (%rcx,%rax),%r15
+       #                               (uint32) c <<<= 18
+       rol     $18,%r15d
+       #                               x10 ^= c
+       xor     %r15,%rbp
+       #                                               x15 = x15_stack
+       movq    176(%rsp),%r15
+       #                               x10_stack = x10
+       movq    %rbp,168(%rsp)
+       #                                               d = x11 + x15
+       lea     (%r12,%r15),%rbp
+       #                                               (uint32) d <<<= 7
+       rol     $7,%ebp
+       #                                               x3 ^= d
+       xor     %rbp,%rsi
+       #                                               d = x15 + x3
+       lea     (%r15,%rsi),%rbp
+       #                                               (uint32) d <<<= 9
+       rol     $9,%ebp
+       #                                               x7 ^= d
+       xor     %rbp,%r8
+       #                                               d = x3 + x7
+       lea     (%rsi,%r8),%rbp
+       #                                               (uint32) d <<<= 13
+       rol     $13,%ebp
+       #                                               x11 ^= d
+       xor     %rbp,%r12
+       #                                               d = x7 + x11
+       lea     (%r8,%r12),%rbp
+       #                                               (uint32) d <<<= 18
+       rol     $18,%ebp
+       #                                               x15 ^= d
+       xor     %rbp,%r15
+       #                                               x15_stack = x15
+       movq    %r15,176(%rsp)
+       #               x5 = x5_stack
+       movq    160(%rsp),%r15
+       # a = x3 + x0
+       lea     (%rsi,%rdx),%rbp
+       # (uint32) a <<<= 7
+       rol     $7,%ebp
+       # x1 ^= a
+       xor     %rbp,%rdi
+       #               b = x4 + x5
+       lea     (%r9,%r15),%rbp
+       #               (uint32) b <<<= 7
+       rol     $7,%ebp
+       #               x6 ^= b
+       xor     %rbp,%rax
+       # a = x0 + x1
+       lea     (%rdx,%rdi),%rbp
+       # (uint32) a <<<= 9
+       rol     $9,%ebp
+       # x2 ^= a
+       xor     %rbp,%rcx
+       #               b = x5 + x6
+       lea     (%r15,%rax),%rbp
+       #               (uint32) b <<<= 9
+       rol     $9,%ebp
+       #               x7 ^= b
+       xor     %rbp,%r8
+       # a = x1 + x2
+       lea     (%rdi,%rcx),%rbp
+       # (uint32) a <<<= 13
+       rol     $13,%ebp
+       # x3 ^= a
+       xor     %rbp,%rsi
+       #               b = x6 + x7
+       lea     (%rax,%r8),%rbp
+       #               (uint32) b <<<= 13
+       rol     $13,%ebp
+       #               x4 ^= b
+       xor     %rbp,%r9
+       # a = x2 + x3
+       lea     (%rcx,%rsi),%rbp
+       # (uint32) a <<<= 18
+       rol     $18,%ebp
+       # x0 ^= a
+       xor     %rbp,%rdx
+       #               b = x7 + x4
+       lea     (%r8,%r9),%rbp
+       #               (uint32) b <<<= 18
+       rol     $18,%ebp
+       #               x5 ^= b
+       xor     %rbp,%r15
+       #                               x10 = x10_stack
+       movq    168(%rsp),%rbp
+       #               x5_stack = x5
+       movq    %r15,160(%rsp)
+       #                               c = x9 + x10
+       lea     (%r10,%rbp),%r15
+       #                               (uint32) c <<<= 7
+       rol     $7,%r15d
+       #                               x11 ^= c
+       xor     %r15,%r12
+       #                               c = x10 + x11
+       lea     (%rbp,%r12),%r15
+       #                               (uint32) c <<<= 9
+       rol     $9,%r15d
+       #                               x8 ^= c
+       xor     %r15,%r11
+       #                               c = x11 + x8
+       lea     (%r12,%r11),%r15
+       #                               (uint32) c <<<= 13
+       rol     $13,%r15d
+       #                               x9 ^= c
+       xor     %r15,%r10
+       #                               c = x8 + x9
+       lea     (%r11,%r10),%r15
+       #                               (uint32) c <<<= 18
+       rol     $18,%r15d
+       #                               x10 ^= c
+       xor     %r15,%rbp
+       #                                               x15 = x15_stack
+       movq    176(%rsp),%r15
+       #                               x10_stack = x10
+       movq    %rbp,168(%rsp)
+       #                                               d = x14 + x15
+       lea     (%rbx,%r15),%rbp
+       #                                               (uint32) d <<<= 7
+       rol     $7,%ebp
+       #                                               x12 ^= d
+       xor     %rbp,%r14
+       #                                               d = x15 + x12
+       lea     (%r15,%r14),%rbp
+       #                                               (uint32) d <<<= 9
+       rol     $9,%ebp
+       #                                               x13 ^= d
+       xor     %rbp,%r13
+       #                                               d = x12 + x13
+       lea     (%r14,%r13),%rbp
+       #                                               (uint32) d <<<= 13
+       rol     $13,%ebp
+       #                                               x14 ^= d
+       xor     %rbp,%rbx
+       #                                               d = x13 + x14
+       lea     (%r13,%rbx),%rbp
+       #                                               (uint32) d <<<= 18
+       rol     $18,%ebp
+       #                                               x15 ^= d
+       xor     %rbp,%r15
+       #                                               x15_stack = x15
+       movq    %r15,176(%rsp)
+       #               x5 = x5_stack
+       movq    160(%rsp),%r15
+       # a = x12 + x0
+       lea     (%r14,%rdx),%rbp
+       # (uint32) a <<<= 7
+       rol     $7,%ebp
+       # x4 ^= a
+       xor     %rbp,%r9
+       #               b = x1 + x5
+       lea     (%rdi,%r15),%rbp
+       #               (uint32) b <<<= 7
+       rol     $7,%ebp
+       #               x9 ^= b
+       xor     %rbp,%r10
+       # a = x0 + x4
+       lea     (%rdx,%r9),%rbp
+       # (uint32) a <<<= 9
+       rol     $9,%ebp
+       # x8 ^= a
+       xor     %rbp,%r11
+       #               b = x5 + x9
+       lea     (%r15,%r10),%rbp
+       #               (uint32) b <<<= 9
+       rol     $9,%ebp
+       #               x13 ^= b
+       xor     %rbp,%r13
+       # a = x4 + x8
+       lea     (%r9,%r11),%rbp
+       # (uint32) a <<<= 13
+       rol     $13,%ebp
+       # x12 ^= a
+       xor     %rbp,%r14
+       #               b = x9 + x13
+       lea     (%r10,%r13),%rbp
+       #               (uint32) b <<<= 13
+       rol     $13,%ebp
+       #               x1 ^= b
+       xor     %rbp,%rdi
+       # a = x8 + x12
+       lea     (%r11,%r14),%rbp
+       # (uint32) a <<<= 18
+       rol     $18,%ebp
+       # x0 ^= a
+       xor     %rbp,%rdx
+       #               b = x13 + x1
+       lea     (%r13,%rdi),%rbp
+       #               (uint32) b <<<= 18
+       rol     $18,%ebp
+       #               x5 ^= b
+       xor     %rbp,%r15
+       #                               x10 = x10_stack
+       movq    168(%rsp),%rbp
+       #               x5_stack = x5
+       movq    %r15,160(%rsp)
+       #                               c = x6 + x10
+       lea     (%rax,%rbp),%r15
+       #                               (uint32) c <<<= 7
+       rol     $7,%r15d
+       #                               x14 ^= c
+       xor     %r15,%rbx
+       #                               c = x10 + x14
+       lea     (%rbp,%rbx),%r15
+       #                               (uint32) c <<<= 9
+       rol     $9,%r15d
+       #                               x2 ^= c
+       xor     %r15,%rcx
+       #                               c = x14 + x2
+       lea     (%rbx,%rcx),%r15
+       #                               (uint32) c <<<= 13
+       rol     $13,%r15d
+       #                               x6 ^= c
+       xor     %r15,%rax
+       #                               c = x2 + x6
+       lea     (%rcx,%rax),%r15
+       #                               (uint32) c <<<= 18
+       rol     $18,%r15d
+       #                               x10 ^= c
+       xor     %r15,%rbp
+       #                                               x15 = x15_stack
+       movq    176(%rsp),%r15
+       #                               x10_stack = x10
+       movq    %rbp,168(%rsp)
+       #                                               d = x11 + x15
+       lea     (%r12,%r15),%rbp
+       #                                               (uint32) d <<<= 7
+       rol     $7,%ebp
+       #                                               x3 ^= d
+       xor     %rbp,%rsi
+       #                                               d = x15 + x3
+       lea     (%r15,%rsi),%rbp
+       #                                               (uint32) d <<<= 9
+       rol     $9,%ebp
+       #                                               x7 ^= d
+       xor     %rbp,%r8
+       #                                               d = x3 + x7
+       lea     (%rsi,%r8),%rbp
+       #                                               (uint32) d <<<= 13
+       rol     $13,%ebp
+       #                                               x11 ^= d
+       xor     %rbp,%r12
+       #                                               d = x7 + x11
+       lea     (%r8,%r12),%rbp
+       #                                               (uint32) d <<<= 18
+       rol     $18,%ebp
+       #                                               x15 ^= d
+       xor     %rbp,%r15
+       #                                               x15_stack = x15
+       movq    %r15,176(%rsp)
+       #               x5 = x5_stack
+       movq    160(%rsp),%r15
+       # a = x3 + x0
+       lea     (%rsi,%rdx),%rbp
+       # (uint32) a <<<= 7
+       rol     $7,%ebp
+       # x1 ^= a
+       xor     %rbp,%rdi
+       #               b = x4 + x5
+       lea     (%r9,%r15),%rbp
+       #               (uint32) b <<<= 7
+       rol     $7,%ebp
+       #               x6 ^= b
+       xor     %rbp,%rax
+       # a = x0 + x1
+       lea     (%rdx,%rdi),%rbp
+       # (uint32) a <<<= 9
+       rol     $9,%ebp
+       # x2 ^= a
+       xor     %rbp,%rcx
+       #               b = x5 + x6
+       lea     (%r15,%rax),%rbp
+       #               (uint32) b <<<= 9
+       rol     $9,%ebp
+       #               x7 ^= b
+       xor     %rbp,%r8
+       # a = x1 + x2
+       lea     (%rdi,%rcx),%rbp
+       # (uint32) a <<<= 13
+       rol     $13,%ebp
+       # x3 ^= a
+       xor     %rbp,%rsi
+       #               b = x6 + x7
+       lea     (%rax,%r8),%rbp
+       #               (uint32) b <<<= 13
+       rol     $13,%ebp
+       #               x4 ^= b
+       xor     %rbp,%r9
+       # a = x2 + x3
+       lea     (%rcx,%rsi),%rbp
+       # (uint32) a <<<= 18
+       rol     $18,%ebp
+       # x0 ^= a
+       xor     %rbp,%rdx
+       #               b = x7 + x4
+       lea     (%r8,%r9),%rbp
+       #               (uint32) b <<<= 18
+       rol     $18,%ebp
+       #               x5 ^= b
+       xor     %rbp,%r15
+       #                               x10 = x10_stack
+       movq    168(%rsp),%rbp
+       #               x5_stack = x5
+       movq    %r15,160(%rsp)
+       #                               c = x9 + x10
+       lea     (%r10,%rbp),%r15
+       #                               (uint32) c <<<= 7
+       rol     $7,%r15d
+       #                               x11 ^= c
+       xor     %r15,%r12
+       #                               c = x10 + x11
+       lea     (%rbp,%r12),%r15
+       #                               (uint32) c <<<= 9
+       rol     $9,%r15d
+       #                               x8 ^= c
+       xor     %r15,%r11
+       #                               c = x11 + x8
+       lea     (%r12,%r11),%r15
+       #                               (uint32) c <<<= 13
+       rol     $13,%r15d
+       #                               x9 ^= c
+       xor     %r15,%r10
+       #                               c = x8 + x9
+       lea     (%r11,%r10),%r15
+       #                               (uint32) c <<<= 18
+       rol     $18,%r15d
+       #                               x10 ^= c
+       xor     %r15,%rbp
+       #                                               x15 = x15_stack
+       movq    176(%rsp),%r15
+       #                               x10_stack = x10
+       movq    %rbp,168(%rsp)
+       #                                               d = x14 + x15
+       lea     (%rbx,%r15),%rbp
+       #                                               (uint32) d <<<= 7
+       rol     $7,%ebp
+       #                                               x12 ^= d
+       xor     %rbp,%r14
+       #                                               d = x15 + x12
+       lea     (%r15,%r14),%rbp
+       #                                               (uint32) d <<<= 9
+       rol     $9,%ebp
+       #                                               x13 ^= d
+       xor     %rbp,%r13
+       #                                               d = x12 + x13
+       lea     (%r14,%r13),%rbp
+       #                                               (uint32) d <<<= 13
+       rol     $13,%ebp
+       #                                               x14 ^= d
+       xor     %rbp,%rbx
+       #                                               d = x13 + x14
+       lea     (%r13,%rbx),%rbp
+       #                                               (uint32) d <<<= 18
+       rol     $18,%ebp
+       #                                               x15 ^= d
+       xor     %rbp,%r15
+       #                                               x15_stack = x15
+       movq    %r15,176(%rsp)
+       #   i = i_backup
+       movq    184(%rsp),%r15
+       #                  unsigned>? i -= 4
+       sub     $4,%r15
+       # comment:fp stack unchanged by jump
+       # goto mainloop if unsigned>
+       ja      ._mainloop
+       #   (uint32) x2 += j2
+       addl    64(%rsp),%ecx
+       #   x3 <<= 32
+       shl     $32,%rsi
+       #   x3 += j2
+       addq    64(%rsp),%rsi
+       #   (uint64) x3 >>= 32
+       shr     $32,%rsi
+       #   x3 <<= 32
+       shl     $32,%rsi
+       #   x2 += x3
+       add     %rsi,%rcx
+       #   (uint32) x6 += j6
+       addl    80(%rsp),%eax
+       #   x7 <<= 32
+       shl     $32,%r8
+       #   x7 += j6
+       addq    80(%rsp),%r8
+       #   (uint64) x7 >>= 32
+       shr     $32,%r8
+       #   x7 <<= 32
+       shl     $32,%r8
+       #   x6 += x7
+       add     %r8,%rax
+       #   (uint32) x8 += j8
+       addl    88(%rsp),%r11d
+       #   x9 <<= 32
+       shl     $32,%r10
+       #   x9 += j8
+       addq    88(%rsp),%r10
+       #   (uint64) x9 >>= 32
+       shr     $32,%r10
+       #   x9 <<= 32
+       shl     $32,%r10
+       #   x8 += x9
+       add     %r10,%r11
+       #   (uint32) x12 += j12
+       addl    104(%rsp),%r14d
+       #   x13 <<= 32
+       shl     $32,%r13
+       #   x13 += j12
+       addq    104(%rsp),%r13
+       #   (uint64) x13 >>= 32
+       shr     $32,%r13
+       #   x13 <<= 32
+       shl     $32,%r13
+       #   x12 += x13
+       add     %r13,%r14
+       #   (uint32) x0 += j0
+       addl    56(%rsp),%edx
+       #   x1 <<= 32
+       shl     $32,%rdi
+       #   x1 += j0
+       addq    56(%rsp),%rdi
+       #   (uint64) x1 >>= 32
+       shr     $32,%rdi
+       #   x1 <<= 32
+       shl     $32,%rdi
+       #   x0 += x1
+       add     %rdi,%rdx
+       #   x5 = x5_stack
+       movq    160(%rsp),%rdi
+       #   (uint32) x4 += j4
+       addl    72(%rsp),%r9d
+       #   x5 <<= 32
+       shl     $32,%rdi
+       #   x5 += j4
+       addq    72(%rsp),%rdi
+       #   (uint64) x5 >>= 32
+       shr     $32,%rdi
+       #   x5 <<= 32
+       shl     $32,%rdi
+       #   x4 += x5
+       add     %rdi,%r9
+       #   x10 = x10_stack
+       movq    168(%rsp),%r8
+       #   (uint32) x10 += j10
+       addl    96(%rsp),%r8d
+       #   x11 <<= 32
+       shl     $32,%r12
+       #   x11 += j10
+       addq    96(%rsp),%r12
+       #   (uint64) x11 >>= 32
+       shr     $32,%r12
+       #   x11 <<= 32
+       shl     $32,%r12
+       #   x10 += x11
+       add     %r12,%r8
+       #   x15 = x15_stack
+       movq    176(%rsp),%rdi
+       #   (uint32) x14 += j14
+       addl    112(%rsp),%ebx
+       #   x15 <<= 32
+       shl     $32,%rdi
+       #   x15 += j14
+       addq    112(%rsp),%rdi
+       #   (uint64) x15 >>= 32
+       shr     $32,%rdi
+       #   x15 <<= 32
+       shl     $32,%rdi
+       #   x14 += x15
+       add     %rdi,%rbx
+       #   out = out_backup
+       movq    136(%rsp),%rdi
+       #   m = m_backup
+       movq    144(%rsp),%rsi
+       #   x0 ^= *(uint64 *) (m + 0)
+       xorq    0(%rsi),%rdx
+       #   *(uint64 *) (out + 0) = x0
+       movq    %rdx,0(%rdi)
+       #   x2 ^= *(uint64 *) (m + 8)
+       xorq    8(%rsi),%rcx
+       #   *(uint64 *) (out + 8) = x2
+       movq    %rcx,8(%rdi)
+       #   x4 ^= *(uint64 *) (m + 16)
+       xorq    16(%rsi),%r9
+       #   *(uint64 *) (out + 16) = x4
+       movq    %r9,16(%rdi)
+       #   x6 ^= *(uint64 *) (m + 24)
+       xorq    24(%rsi),%rax
+       #   *(uint64 *) (out + 24) = x6
+       movq    %rax,24(%rdi)
+       #   x8 ^= *(uint64 *) (m + 32)
+       xorq    32(%rsi),%r11
+       #   *(uint64 *) (out + 32) = x8
+       movq    %r11,32(%rdi)
+       #   x10 ^= *(uint64 *) (m + 40)
+       xorq    40(%rsi),%r8
+       #   *(uint64 *) (out + 40) = x10
+       movq    %r8,40(%rdi)
+       #   x12 ^= *(uint64 *) (m + 48)
+       xorq    48(%rsi),%r14
+       #   *(uint64 *) (out + 48) = x12
+       movq    %r14,48(%rdi)
+       #   x14 ^= *(uint64 *) (m + 56)
+       xorq    56(%rsi),%rbx
+       #   *(uint64 *) (out + 56) = x14
+       movq    %rbx,56(%rdi)
+       #   bytes = bytes_backup
+       movq    152(%rsp),%rdx
+       #   in8 = j8
+       movq    88(%rsp),%rcx
+       #   in8 += 1
+       add     $1,%rcx
+       #   j8 = in8
+       movq    %rcx,88(%rsp)
+       #                          unsigned>? unsigned<? bytes - 64
+       cmp     $64,%rdx
+       # comment:fp stack unchanged by jump
+       #   goto bytesatleast65 if unsigned>
+       ja      ._bytesatleast65
+       # comment:fp stack unchanged by jump
+       #     goto bytesatleast64 if !unsigned<
+       jae     ._bytesatleast64
+       #       m = out
+       mov     %rdi,%rsi
+       #       out = ctarget
+       movq    128(%rsp),%rdi
+       #       i = bytes
+       mov     %rdx,%rcx
+       #       while (i) { *out++ = *m++; --i }
+       rep     movsb
+       # comment:fp stack unchanged by fallthrough
+#     bytesatleast64:
+._bytesatleast64:
+       #     x = x_backup
+       movq    120(%rsp),%rdi
+       #     in8 = j8
+       movq    88(%rsp),%rsi
+       #     *(uint64 *) (x + 32) = in8
+       movq    %rsi,32(%rdi)
+       #     r11 = r11_stack
+       movq    0(%rsp),%r11
+       #     r12 = r12_stack
+       movq    8(%rsp),%r12
+       #     r13 = r13_stack
+       movq    16(%rsp),%r13
+       #     r14 = r14_stack
+       movq    24(%rsp),%r14
+       #     r15 = r15_stack
+       movq    32(%rsp),%r15
+       #     rbx = rbx_stack
+       movq    40(%rsp),%rbx
+       #     rbp = rbp_stack
+       movq    48(%rsp),%rbp
+       # comment:fp stack unchanged by fallthrough
+#     done:
+._done:
+       #     leave
+       add     %r11,%rsp
+       mov     %rdi,%rax
+       mov     %rsi,%rdx
+       ret
+#   bytesatleast65:
+._bytesatleast65:
+       #   bytes -= 64
+       sub     $64,%rdx
+       #   out += 64
+       add     $64,%rdi
+       #   m += 64
+       add     $64,%rsi
+       # comment:fp stack unchanged by jump
+       # goto bytesatleast1
+       jmp     ._bytesatleast1
+# enter ECRYPT_keysetup
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+       mov     %rsp,%r11
+       and     $31,%r11
+       add     $256,%r11
+       sub     %r11,%rsp
+       #   k = arg2
+       mov     %rsi,%rsi
+       #   kbits = arg3
+       mov     %rdx,%rdx
+       #   x = arg1
+       mov     %rdi,%rdi
+       #   in0 = *(uint64 *) (k + 0)
+       movq    0(%rsi),%r8
+       #   in2 = *(uint64 *) (k + 8)
+       movq    8(%rsi),%r9
+       #   *(uint64 *) (x + 4) = in0
+       movq    %r8,4(%rdi)
+       #   *(uint64 *) (x + 12) = in2
+       movq    %r9,12(%rdi)
+       #                    unsigned<? kbits - 256
+       cmp     $256,%rdx
+       # comment:fp stack unchanged by jump
+       #   goto kbits128 if unsigned<
+       jb      ._kbits128
+#   kbits256:
+._kbits256:
+       #     in10 = *(uint64 *) (k + 16)
+       movq    16(%rsi),%rdx
+       #     in12 = *(uint64 *) (k + 24)
+       movq    24(%rsi),%rsi
+       #     *(uint64 *) (x + 44) = in10
+       movq    %rdx,44(%rdi)
+       #     *(uint64 *) (x + 52) = in12
+       movq    %rsi,52(%rdi)
+       #     in0 = 1634760805
+       mov     $1634760805,%rsi
+       #     in4 = 857760878
+       mov     $857760878,%rdx
+       #     in10 = 2036477234
+       mov     $2036477234,%rcx
+       #     in14 = 1797285236
+       mov     $1797285236,%r8
+       #     *(uint32 *) (x + 0) = in0
+       movl    %esi,0(%rdi)
+       #     *(uint32 *) (x + 20) = in4
+       movl    %edx,20(%rdi)
+       #     *(uint32 *) (x + 40) = in10
+       movl    %ecx,40(%rdi)
+       #     *(uint32 *) (x + 60) = in14
+       movl    %r8d,60(%rdi)
+       # comment:fp stack unchanged by jump
+       #   goto keysetupdone
+       jmp     ._keysetupdone
+#   kbits128:
+._kbits128:
+       #     in10 = *(uint64 *) (k + 0)
+       movq    0(%rsi),%rdx
+       #     in12 = *(uint64 *) (k + 8)
+       movq    8(%rsi),%rsi
+       #     *(uint64 *) (x + 44) = in10
+       movq    %rdx,44(%rdi)
+       #     *(uint64 *) (x + 52) = in12
+       movq    %rsi,52(%rdi)
+       #     in0 = 1634760805
+       mov     $1634760805,%rsi
+       #     in4 = 824206446
+       mov     $824206446,%rdx
+       #     in10 = 2036477238
+       mov     $2036477238,%rcx
+       #     in14 = 1797285236
+       mov     $1797285236,%r8
+       #     *(uint32 *) (x + 0) = in0
+       movl    %esi,0(%rdi)
+       #     *(uint32 *) (x + 20) = in4
+       movl    %edx,20(%rdi)
+       #     *(uint32 *) (x + 40) = in10
+       movl    %ecx,40(%rdi)
+       #     *(uint32 *) (x + 60) = in14
+       movl    %r8d,60(%rdi)
+#   keysetupdone:
+._keysetupdone:
+       # leave
+       add     %r11,%rsp
+       mov     %rdi,%rax
+       mov     %rsi,%rdx
+       ret
+# enter ECRYPT_ivsetup
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+       mov     %rsp,%r11
+       and     $31,%r11
+       add     $256,%r11
+       sub     %r11,%rsp
+       #   iv = arg2
+       mov     %rsi,%rsi
+       #   x = arg1
+       mov     %rdi,%rdi
+       #   in6 = *(uint64 *) (iv + 0)
+       movq    0(%rsi),%rsi
+       #   in8 = 0
+       mov     $0,%r8
+       #   *(uint64 *) (x + 24) = in6
+       movq    %rsi,24(%rdi)
+       #   *(uint64 *) (x + 32) = in8
+       movq    %r8,32(%rdi)
+       # leave
+       add     %r11,%rsp
+       mov     %rdi,%rax
+       mov     %rsi,%rdx
+       ret
index 3be4439..bccb76d 100644 (file)
@@ -8,6 +8,8 @@
  * and to remove extraneous comments and functions that are not needed.
  * - i586 version, renamed as salsa20-i586-asm_32.S
  *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
+ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
index 221356b..b0481f7 100644 (file)
@@ -504,6 +504,21 @@ config CRYPTO_SALSA20_586
          The Salsa20 stream cipher algorithm is designed by Daniel J.
          Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
 
+config CRYPTO_SALSA20_X86_64
+       tristate "Salsa20 stream cipher algorithm (x86_64) (EXPERIMENTAL)"
+       depends on (X86 || UML_X86) && 64BIT
+       depends on EXPERIMENTAL
+       select CRYPTO_BLKCIPHER
+       select CRYPTO_SALSA20
+       help
+         Salsa20 stream cipher algorithm.
+
+         Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
+         Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
+
+         The Salsa20 stream cipher algorithm is designed by Daniel J.
+         Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
+
 config CRYPTO_DEFLATE
        tristate "Deflate compression algorithm"
        select CRYPTO_ALGAPI