/* Copyright 2002 Andi Kleen */

/*
 * NOTE(review): the header names on the original #include lines were lost
 * (only the bare "#include" keywords survived).  The three below are
 * reconstructed from the macros this file uses -- ENTRY/ENDPROC/ALIGN,
 * CFI_*, and X86_FEATURE_REP_GOOD respectively.  The original listed a
 * fourth header whose name cannot be determined from this file; restore it
 * if the build needs it.
 */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>

/*
 * memcpy - Copy a memory block.
 *
 * Syntax: AT&T (GNU as), x86-64.
 *
 * Input:
 *	rdi	destination
 *	rsi	source
 *	rdx	count (bytes, full 64-bit value is honoured)
 *
 * Output:
 *	rax	original destination
 *
 * Clobbers: rcx, rdx (memcpy_c only), rsi, rdi, r8-r11 (memcpy only), flags.
 * No stack is used and no callee-saved register is touched.
 */

/*
 * memcpy_c - string-instruction variant.
 *
 * Copies count/8 quadwords with "rep movsq" and the remaining count%8
 * bytes with "rep movsb".  Reached through the alternatives record at the
 * end of this file, which describes a short jmp from memcpy to here.
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq	%rdi, %rax		/* return value: original destination */

	movq	%rdx, %rcx
	shrq	$3, %rcx		/* rcx = number of whole quadwords */
	andl	$7, %edx		/* edx = trailing bytes (0..7) */
	rep movsq

	movl	%edx, %ecx		/* 32-bit write zero-extends into rcx */
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

/*
 * memcpy / __memcpy - unrolled variant.
 *
 * Three phases: 64-byte blocks, then 8-byte words, then single bytes.
 * In every loop the counter is decremented at the top and the resulting
 * zero flag is consumed by the jnz at the bottom; the movq/movb/leaq
 * instructions in between do not modify flags.
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq	%rdi, %rax		/* return value: original destination */

	movq	%rdx, %rcx
	shrq	$6, %rcx		/* rcx = number of full 64-byte blocks */
	jz	.Lhandle_tail

	.p2align 4
.Lloop_64:
	decq	%rcx			/* sets ZF for the jnz below */

	movq	0*8(%rsi), %r11
	movq	1*8(%rsi), %r8
	movq	%r11, 0*8(%rdi)
	movq	%r8, 1*8(%rdi)

	movq	2*8(%rsi), %r9
	movq	3*8(%rsi), %r10
	movq	%r9, 2*8(%rdi)
	movq	%r10, 3*8(%rdi)

	movq	4*8(%rsi), %r11
	movq	5*8(%rsi), %r8
	movq	%r11, 4*8(%rdi)
	movq	%r8, 5*8(%rdi)

	movq	6*8(%rsi), %r9
	movq	7*8(%rsi), %r10
	movq	%r9, 6*8(%rdi)
	movq	%r10, 7*8(%rdi)

	leaq	64(%rsi), %rsi		/* lea: advance without touching flags */
	leaq	64(%rdi), %rdi
	jnz	.Lloop_64

	/*
	 * Fewer than 64 bytes remain.  Only the low six bits of the count
	 * matter from here on, so 32-bit arithmetic on edx is sufficient.
	 */
.Lhandle_tail:
	movl	%edx, %ecx
	andl	$63, %ecx
	shrl	$3, %ecx		/* ecx = remaining whole quadwords (0..7) */
	jz	.Lhandle_7

	.p2align 4
.Lloop_8:
	decl	%ecx			/* sets ZF for the jnz below */
	movq	(%rsi), %r8
	movq	%r8, (%rdi)
	leaq	8(%rdi), %rdi
	leaq	8(%rsi), %rsi
	jnz	.Lloop_8

.Lhandle_7:
	movl	%edx, %ecx
	andl	$7, %ecx		/* ecx = remaining bytes (0..7) */
	jz	.Lende

	.p2align 4
.Lloop_1:
	movb	(%rsi), %r8b
	movb	%r8b, (%rdi)
	incq	%rdi
	incq	%rsi
	decl	%ecx
	jnz	.Lloop_1

.Lende:
	ret
.Lfinal:				/* end marker: .Lfinal - memcpy = code length */
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler.  Use this when possible.
	 */

	/*
	 * Replacement code: a two-byte short jmp to memcpy_c.  The rel8
	 * displacement is computed as if the jmp were located at memcpy
	 * (displacement is relative to the end of the jmp, hence the
	 * subtraction of its own length, 2f - 1b).
	 */
	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous

	/*
	 * Alternatives record: original code address, replacement address,
	 * required CPU feature, original code length, replacement length.
	 */
	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad 1b
	.byte X86_FEATURE_REP_GOOD
	.byte .Lfinal - memcpy
	.byte 2b - 1b
	.previous