x86: memcpy, clean up

author Ingo Molnar <mingo@elte.hu>

Thu, 12 Mar 2009 11:20:17 +0000 (12:20 +0100)

committer Ingo Molnar <mingo@elte.hu>

Thu, 12 Mar 2009 11:21:17 +0000 (12:21 +0100)
author Ingo Molnar <mingo@elte.hu>
Thu, 12 Mar 2009 11:20:17 +0000 (12:20 +0100)
committer Ingo Molnar <mingo@elte.hu>
Thu, 12 Mar 2009 11:21:17 +0000 (12:21 +0100)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S

index 10c0676..ad5441e 100644 (file)
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
  /* Copyright 2002 Andi Kleen */
  
  #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
  #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
  
  /*
   * memcpy - Copy a memory block.
   *
- * Input:      
- * rdi destination
- * rsi source
- * rdx count
- * 
+ * Input:
+ *  rdi destination
+ *  rsi source
+ *  rdx count
+ *
   * Output:
   * rax original destination
- */    
+ */
  
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
         ALIGN
  memcpy_c:
         CFI_STARTPROC
-       movq %rdi,%rax
-       movl %edx,%ecx
-       shrl $3,%ecx
-       andl $7,%edx
+       movq %rdi, %rax
+
+       movl %edx, %ecx
+       shrl $3, %ecx
+       andl $7, %edx
         rep movsq
-       movl %edx,%ecx
+       movl %edx, %ecx
         rep movsb
         ret
         CFI_ENDPROC
@@ -33,92 +41,110 @@ ENDPROC(memcpy_c)
  ENTRY(__memcpy)
  ENTRY(memcpy)
         CFI_STARTPROC
-       movq %rdi,%rax
  
-       movl %edx,%ecx
-       shrl $6,%ecx
+       /*
+        * Put the number of full 64-byte blocks into %ecx.
+        * Tail portion is handled at the end:
+        */
+       movq %rdi, %rax
+       movl %edx, %ecx
+       shrl   $6, %ecx
         jz .Lhandle_tail
  
         .p2align 4
  .Lloop_64:
+       /*
+        * We decrement the loop index here - and the zero-flag is
+        * checked at the end of the loop (instructions in between do
+        * not change the zero flag):
+        */
         decl %ecx
  
-       movq (%rsi),%r11
-       movq 8(%rsi),%r8
+       /*
+        * Move in blocks of 4x16 bytes:
+        */
+       movq 0*8(%rsi),         %r11
+       movq 1*8(%rsi),         %r8
+       movq %r11,              0*8(%rdi)
+       movq %r8,               1*8(%rdi)
  
-       movq %r11,(%rdi)
-       movq %r8,1*8(%rdi)
+       movq 2*8(%rsi),         %r9
+       movq 3*8(%rsi),         %r10
+       movq %r9,               2*8(%rdi)
+       movq %r10,              3*8(%rdi)
  
-       movq 2*8(%rsi),%r9
-       movq 3*8(%rsi),%r10
+       movq 4*8(%rsi),         %r11
+       movq 5*8(%rsi),         %r8
+       movq %r11,              4*8(%rdi)
+       movq %r8,               5*8(%rdi)
  
-       movq %r9,2*8(%rdi)
-       movq %r10,3*8(%rdi)
+       movq 6*8(%rsi),         %r9
+       movq 7*8(%rsi),         %r10
+       movq %r9,               6*8(%rdi)
+       movq %r10,              7*8(%rdi)
  
-       movq 4*8(%rsi),%r11
-       movq 5*8(%rsi),%r8
+       leaq 64(%rsi), %rsi
+       leaq 64(%rdi), %rdi
  
-       movq %r11,4*8(%rdi)
-       movq %r8,5*8(%rdi)
-
-       movq 6*8(%rsi),%r9
-       movq 7*8(%rsi),%r10
-
-       movq %r9,6*8(%rdi)
-       movq %r10,7*8(%rdi)
-
-       leaq 64(%rsi),%rsi
-       leaq 64(%rdi),%rdi
         jnz  .Lloop_64
  
  .Lhandle_tail:
-       movl %edx,%ecx
-       andl $63,%ecx
-       shrl $3,%ecx
+       movl %edx, %ecx
+       andl  $63, %ecx
+       shrl   $3, %ecx
         jz   .Lhandle_7
+
         .p2align 4
  .Lloop_8:
         decl %ecx
-       movq (%rsi),%r8
-       movq %r8,(%rdi)
-       leaq 8(%rdi),%rdi
-       leaq 8(%rsi),%rsi
+       movq (%rsi),            %r8
+       movq %r8,               (%rdi)
+       leaq 8(%rdi),           %rdi
+       leaq 8(%rsi),           %rsi
         jnz  .Lloop_8
  
  .Lhandle_7:
-       movl %edx,%ecx
-       andl $7,%ecx
-       jz .Lende
+       movl %edx, %ecx
+       andl $7, %ecx
+       jz .Lend
+
         .p2align 4
  .Lloop_1:
-       movb (%rsi),%r8b
-       movb %r8b,(%rdi)
+       movb (%rsi), %r8b
+       movb %r8b, (%rdi)
         incq %rdi
         incq %rsi
         decl %ecx
         jnz .Lloop_1
  
-.Lende:
+.Lend:
         ret
         CFI_ENDPROC
  ENDPROC(memcpy)
  ENDPROC(__memcpy)
  
-       /* Some CPUs run faster using the string copy instructions.
-          It is also a lot simpler. Use this when possible */
+       /*
+        * Some CPUs run faster using the string copy instructions.
+        * It is also a lot simpler. Use this when possible:
+        */
  
-       .section .altinstr_replacement,"ax"
+       .section .altinstr_replacement, "ax"
  1:     .byte 0xeb                              /* jmp <disp8> */
         .byte (memcpy_c - memcpy) - (2f - 1b)   /* offset */
  2:
         .previous
-       .section .altinstructions,"a"
+
+       .section .altinstructions, "a"
         .align 8
         .quad memcpy
         .quad 1b
         .byte X86_FEATURE_REP_GOOD
-       /* Replace only beginning, memcpy is used to apply alternatives, so it
-        * is silly to overwrite itself with nops - reboot is only outcome... */
+
+       /*
+        * Replace only the beginning, memcpy is used to apply alternatives,
+        * so it is silly to overwrite itself with nops - reboot is the
+        * only outcome...
+        */
         .byte 2b - 1b
         .byte 2b - 1b
         .previous
author	Ingo Molnar <mingo@elte.hu>
	Thu, 12 Mar 2009 11:20:17 +0000 (12:20 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Thu, 12 Mar 2009 11:21:17 +0000 (12:21 +0100)