arch/x86/lib/memmove_64.S

   1 /*
   2  * Normally compiler builtins are used, but sometimes the compiler calls out
   3  * of line code. Based on asm-i386/string.h.
   4  *
   5  * This assembly file is re-written from memmove_64.c file.
   6  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   7  */
   8 #define _STRING_C
   9 #include <linux/linkage.h>
  10 #include <asm/dwarf2.h>
  11 #include <asm/cpufeature.h>
  12
  13 #undef memmove
  14
   15 /*
   16  * Implement memmove(): copy count bytes from src to dest. The two regions
       * may overlap; the copy direction is chosen from the relative position of
       * src and dest.
   17  *
   18  * Input:
   19  * rdi: dest
   20  * rsi: src
   21  * rdx: count (in bytes)
   22  *
   23  * Output:
   24  * rax: dest
       *
       * Also modified, depending on the path taken: rcx, rdx, rsi, rdi, r8-r11
       * and the flags. The backward "rep movsq" path sets the direction flag
       * with std and clears it again with cld before returning.
   25  */
   26 ENTRY(memmove)
   27         CFI_STARTPROC
   28
   29         /* rax = dest (return value); counts below 32 bytes go straight to the tail code at 1: */
   30         mov %rdi, %rax
   31         cmp $0x20, %rdx
   32         jb      1f
   33
   34         /* Decide forward/backward copy mode */
   35         cmp %rdi, %rsi                    /* NOTE(review): jge/jg below are signed tests on addresses - confirm intended */
   36         jge .Lmemmove_begin_forward       /* src >= dest: copy forward */
   37         mov %rsi, %r8
   38         add %rdx, %r8                     /* r8 = src + count */
   39         cmp %rdi, %r8
   40         jg 2f                             /* src + count > dest: source end overlaps dest, copy backward */
   41
   42 .Lmemmove_begin_forward:                  /* start of the range described by the .altinstructions entry */
   43         /*
   44          * rep movsq has a high startup latency, so counts below 680 bytes
   45          * are copied through general-purpose registers instead.
   46          */
   47         cmp  $680, %rdx
   48         jb      3f
   49         /*
   50          * Use rep movsq only when src and dest have the same low address byte.
   51          */
   52
   53         cmpb %dil, %sil
   54         je 4f
   55 3:
   56         sub $0x20, %rdx                   /* bias count by -32 so the loop can test the borrow */
   57         /*
   58          * We gobble 32 bytes forward in each loop.
   59          */
   60 5:
   61         sub $0x20, %rdx                   /* CF set when fewer than 32 bytes will remain after this pass */
   62         movq 0*8(%rsi), %r11              /* load all 32 bytes before storing any of them */
   63         movq 1*8(%rsi), %r10
   64         movq 2*8(%rsi), %r9
   65         movq 3*8(%rsi), %r8
   66         leaq 4*8(%rsi), %rsi              /* lea advances src without touching the flags */
   67
   68         movq %r11, 0*8(%rdi)
   69         movq %r10, 1*8(%rdi)
   70         movq %r9, 2*8(%rdi)
   71         movq %r8, 3*8(%rdi)
   72         leaq 4*8(%rdi), %rdi              /* lea advances dest without touching the flags */
   73         jae 5b                            /* still tests the borrow from the sub at 5: */
   74         addq $0x20, %rdx                  /* undo the bias: rdx = bytes left to copy (0..31) */
   75         jmp 1f                            /* finish the remainder in the tail code */
   76         /*
   77          * Forward copy with rep movsq.
   78          */
   79         .p2align 4
   80 4:
   81         movq %rdx, %rcx
   82         movq -8(%rsi, %rdx), %r11         /* save the last 8 source bytes before the copy starts */
   83         lea -8(%rdi, %rdx), %r10          /* r10 = address of the last 8 dest bytes */
   84         shrq $3, %rcx                     /* rcx = count / 8 quadwords */
   85         rep movsq
   86         movq %r11, (%r10)                 /* store the saved tail; covers the count % 8 leftover bytes */
   87         jmp 13f
   88 .Lmemmove_end_forward:                    /* end of the range described by the .altinstructions entry */
   89
   90         /*
   91          * Backward copy with rep movsq.
   92          */
   93         .p2align 4
   94 7:
   95         movq %rdx, %rcx
   96         movq (%rsi), %r11                 /* save the first 8 source bytes before the copy starts */
   97         movq %rdi, %r10                   /* r10 = dest, where the saved head is stored afterwards */
   98         leaq -8(%rsi, %rdx), %rsi         /* rsi = last quadword of the source */
   99         leaq -8(%rdi, %rdx), %rdi         /* rdi = last quadword of the destination */
  100         shrq $3, %rcx                     /* rcx = count / 8 quadwords */
  101         std                               /* DF = 1: rep movsq decrements rsi/rdi */
  102         rep movsq
  103         cld                               /* clear DF again before returning */
  104         movq %r11, (%r10)                 /* store the saved head; covers the count % 8 leftover bytes */
  105         jmp 13f
  106
  107         /*
  108          * Start to prepare for backward copy.
  109          */
  110         .p2align 4
  111 2:
  112         cmp $680, %rdx                    /* same 680-byte threshold as the forward path */
  113         jb 6f
  114         cmp %dil, %sil                    /* same low address byte: use the rep movsq path at 7: */
  115         je 7b
  116 6:
  117         /*
  118          * Point src and dest just past their last byte.
  119          */
  120         addq %rdx, %rsi
  121         addq %rdx, %rdi
  122         subq $0x20, %rdx                  /* bias count by -32 so the loop can test the borrow */
  123         /*
  124          * We gobble 32 bytes backward in each loop.
  125          */
  126 8:
  127         subq $0x20, %rdx                  /* CF set when fewer than 32 bytes will remain after this pass */
  128         movq -1*8(%rsi), %r11             /* load all 32 bytes before storing any of them */
  129         movq -2*8(%rsi), %r10
  130         movq -3*8(%rsi), %r9
  131         movq -4*8(%rsi), %r8
  132         leaq -4*8(%rsi), %rsi             /* lea moves src down without touching the flags */
  133
  134         movq %r11, -1*8(%rdi)
  135         movq %r10, -2*8(%rdi)
  136         movq %r9, -3*8(%rdi)
  137         movq %r8, -4*8(%rdi)
  138         leaq -4*8(%rdi), %rdi             /* lea moves dest down without touching the flags */
  139         jae 8b                            /* still tests the borrow from the subq at 8: */
  140         /*
  141          * Undo the bias and move src/dest back to the start of the
           * bytes that are still uncopied at the head.
  142          */
  143         addq $0x20, %rdx                  /* rdx = bytes left to copy (0..31) */
  144         subq %rdx, %rsi
  145         subq %rdx, %rdi
  146 1:                                        /* tail code: rdx = bytes left to copy, below 32 */
  147         cmpq $16, %rdx
  148         jb 9f
  149         /*
  150          * Copy 16 to 31 bytes: load the first and last 16 bytes, then store them.
  151          */
  152         movq 0*8(%rsi), %r11
  153         movq 1*8(%rsi), %r10
  154         movq -2*8(%rsi, %rdx), %r9
  155         movq -1*8(%rsi, %rdx), %r8
  156         movq %r11, 0*8(%rdi)
  157         movq %r10, 1*8(%rdi)
  158         movq %r9, -2*8(%rdi, %rdx)
  159         movq %r8, -1*8(%rdi, %rdx)
  160         jmp 13f
  161         .p2align 4
  162 9:
  163         cmpq $8, %rdx
  164         jb 10f
  165         /*
  166          * Copy 8 to 15 bytes: load the first and last 8 bytes, then store them.
  167          */
  168         movq 0*8(%rsi), %r11
  169         movq -1*8(%rsi, %rdx), %r10
  170         movq %r11, 0*8(%rdi)
  171         movq %r10, -1*8(%rdi, %rdx)
  172         jmp 13f
  173 10:
  174         cmpq $4, %rdx
  175         jb 11f
  176         /*
  177          * Copy 4 to 7 bytes: load the first and last 4 bytes, then store them.
  178          */
  179         movl (%rsi), %r11d
  180         movl -4(%rsi, %rdx), %r10d
  181         movl %r11d, (%rdi)
  182         movl %r10d, -4(%rdi, %rdx)
  183         jmp 13f
  184 11:
  185         cmp $2, %rdx
  186         jb 12f
  187         /*
  188          * Copy 2 to 3 bytes: load the first and last 2 bytes, then store them.
  189          */
  190         movw (%rsi), %r11w
  191         movw -2(%rsi, %rdx), %r10w
  192         movw %r11w, (%rdi)
  193         movw %r10w, -2(%rdi, %rdx)
  194         jmp 13f
  195 12:
  196         cmp $1, %rdx
  197         jb 13f                            /* count == 0: nothing to copy */
  198         /*
  199          * Copy the single remaining byte.
  200          */
  201         movb (%rsi), %r11b
  202         movb %r11b, (%rdi)
  203 13:
  204         retq
  205         CFI_ENDPROC
  206
  207         .section .altinstr_replacement,"ax"
  208 .Lmemmove_begin_forward_efs:
  209         /* Replacement forward copy: the whole count with one rep movsb, then return. Presumably patched over the .Lmemmove_begin_forward range when the CPU has X86_FEATURE_ERMS - confirm against the alternatives code. */
  210         movq %rdx, %rcx
  211         rep movsb
  212         retq
  213 .Lmemmove_end_forward_efs:
  214         .previous
  215
  216         .section .altinstructions,"a"
  217         .align 8
  218         .quad .Lmemmove_begin_forward             /* address of the original code range */
  219         .quad .Lmemmove_begin_forward_efs         /* address of the replacement code */
  220         .word X86_FEATURE_ERMS                    /* CPU feature this entry is keyed on */
  221         .byte .Lmemmove_end_forward-.Lmemmove_begin_forward             /* length of the original range in bytes */
  222         .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs     /* length of the replacement in bytes */
  223         .previous
  224 ENDPROC(memmove)