Merge branch 'intelfb-patches' of master.kernel.org:/pub/scm/linux/kernel/git/airlied...

[pandora-kernel.git] / arch / x86_64 / lib / memset.S
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S

index 2aa48f2..09ed1f6 100644 (file)
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,4 +1,9 @@
  /* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
  /*
   * ISO C memset - set a memory block to a byte value.
   *     
@@ -8,24 +13,122 @@
   * 
   * rax   original destination
   */    
-       .globl __memset
-       .globl memset
-       .p2align 4
-memset:        
-__memset:
+       ALIGN
+memset_c:                      /* memset built on rep stosq / rep stosb */
+       CFI_STARTPROC
        movq %rdi,%r9
        movl %edx,%r8d
-       andl $7,%r8d            
+       andl $7,%r8d            /* r8d = length & 7: bytes left after the qwords */
        movl %edx,%ecx
-       shrl $3,%ecx            
+       shrl $3,%ecx            /* qword count; NOTE(review): only the low 32 bits of the length (%edx) are used */
        /* expand byte value  */
        movzbl %sil,%esi
        movabs $0x0101010101010101,%rax
-       mulq   %rsi             /* with rax, clobbers rdx */
-       rep
-       stosq   
+       mulq %rsi               /* rax = fill byte copied into all 8 bytes; clobbers rdx */
+       rep stosq               /* store rax at (%rdi), rcx times */
        movl %r8d,%ecx
-       rep
-       stosb
+       rep stosb               /* store the remaining 0..7 bytes from al */
        movq %r9,%rax
        ret
+       CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)                  /* in:  rdi = dst, sil = fill byte, rdx = length */
+ENTRY(__memset)                /* out: rax = original dst */
+       CFI_STARTPROC
+       movq %rdi,%r10          /* r10 = dst, handed back in rax at .Lende */
+       movq %rdx,%r11          /* r11 = length; the mul below overwrites rdx */
+
+       /* expand byte value: rax = fill byte replicated into all 8 bytes */
+       movzbl %sil,%ecx
+       movabs $0x0101010101010101,%rax
+       mul    %rcx             /* with rax, clobbers rdx */
+
+       /* align dst: r9d = dst & 7, non-zero means a misaligned start */
+       movl  %edi,%r9d
+       andl  $7,%r9d
+       jnz  .Lbad_alignment
+       CFI_REMEMBER_STATE
+.Lafter_bad_alignment:
+
+       movq  %r11,%rcx         /* 64-bit count so lengths >= 4GB are not cut */
+       shrq  $6,%rcx           /* rcx = number of whole 64-byte chunks */
+       jz       .Lhandle_tail
+
+       .p2align 4
+.Lloop_64:
+       decq   %rcx             /* ZF feeds the jnz; mov/lea leave flags alone */
+       movq  %rax,(%rdi)
+       movq  %rax,8(%rdi)
+       movq  %rax,16(%rdi)
+       movq  %rax,24(%rdi)
+       movq  %rax,32(%rdi)
+       movq  %rax,40(%rdi)
+       movq  %rax,48(%rdi)
+       movq  %rax,56(%rdi)
+       leaq  64(%rdi),%rdi
+       jnz    .Lloop_64
+
+       /* Handle tail in loops. The loops should be faster than hard
+          to predict jump tables. */
+       .p2align 4
+.Lhandle_tail:
+       movl    %r11d,%ecx      /* low 32 bits suffice: masked to bits 3..5 */
+       andl    $63&(~7),%ecx   /* bytes left in whole qwords (0..56) */
+       jz              .Lhandle_7
+       shrl    $3,%ecx         /* ecx = qword count */
+       .p2align 4
+.Lloop_8:
+       decl   %ecx
+       movq  %rax,(%rdi)
+       leaq  8(%rdi),%rdi
+       jnz    .Lloop_8
+
+.Lhandle_7:
+       movl    %r11d,%ecx
+       andl    $7,%ecx         /* ecx = final 0..7 single bytes */
+       jz      .Lende
+       .p2align 4
+.Lloop_1:
+       decl    %ecx
+       movb    %al,(%rdi)
+       leaq    1(%rdi),%rdi
+       jnz     .Lloop_1
+
+.Lende:
+       movq    %r10,%rax       /* return the original destination */
+       ret
+
+       CFI_RESTORE_STATE
+.Lbad_alignment:
+       cmpq $7,%r11
+       jbe     .Lhandle_7      /* <= 7 bytes: byte loop, no alignment needed */
+       movq %rax,(%rdi)        /* unaligned store */
+       movq $8,%r8
+       subq %r9,%r8            /* r8 = 8 - (dst & 7): bytes up to alignment */
+       addq %r8,%rdi
+       subq %r8,%r11
+       jmp .Lafter_bad_alignment
+.Lfinal:
+       CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
+
+       /* Some CPUs run faster using the string instructions, and that
+          version is also a lot simpler. Use it when possible. */
+
+#include <asm/cpufeature.h>
+
+       .section .altinstr_replacement,"ax"
+1:     .byte 0xeb                              /* jmp <disp8> */
+       .byte (memset_c - memset) - (2f - 1b)   /* offset: reaches memset_c when this 2-byte jmp sits at memset */
+2:
+       .previous
+       .section .altinstructions,"a"
+       .align 8
+       .quad memset                            /* address of the code at memset */
+       .quad 1b                                /* address of the jmp bytes above */
+       .byte X86_FEATURE_REP_GOOD              /* CPU feature; presumably gates use of the jmp - confirm against the patching code */
+       .byte .Lfinal - memset                  /* byte length from memset to .Lfinal */
+       .byte 2b - 1b                           /* byte length of the jmp sequence */
+       .previous