arch/sh/lib64/copy_user_memcpy.S

   1 !
   2 ! Fast SH memcpy
   3 !
   4 ! by Toshiyasu Morita (tm@netcom.com)
   5 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   6 ! SH5 code Copyright 2002 SuperH Ltd.
   7 !
   8 ! Entry: ARG0: destination pointer
   9 !        ARG1: source pointer
  10 !        ARG2: byte count
  11 !
  12 ! Exit:  RESULT: destination pointer
  13 !        any other registers in the range r0-r7: trashed
  14 !
  15 ! Notes: Usually one wants to do small reads and write a longword, but
  16 !        unfortunately it is difficult in some cases to concatenate bytes
  17 !        into a longword on the SH, so this does a longword read and small
  18 !        writes.
  19 !
  20 ! This implementation makes two assumptions about how it is called:
  21 !
  22 ! 1.: If the byte count is nonzero, the address of the last byte to be
  23 !     copied is unsigned greater than the address of the first byte to
  24 !     be copied.  This could be easily swapped for a signed comparison,
  25 !     but the algorithm used needs some comparison.
  26 !
  27 ! 2.: When there are two or three bytes in the last word of an 11-or-more
  28 !     bytes memory chunk to be copied, the rest of the word can be read
  29 !     without side effects.
  30 !     This could be easily changed by increasing the minimum size of
  31 !     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  32 !     however, this would cost a few extra cycles on average.
  33 !     For SHmedia, the assumption is that any quadword can be read in its
  34 !     entirety if at least one byte is included in the copy.
  35
  36 /* Imported into Linux kernel by Richard Curnow.  This is used to implement the
  37    __copy_user function in the general case, so it has to be a distinct
  38    function from intra-kernel memcpy to allow for exception fix-ups in the
  39    event that the user pointer is bad somewhere in the copy (e.g. due to
  40    running off the end of the vma).
  41
  42    Note, this algorithm will be slightly wasteful in the case where the source
  43    and destination pointers are equally aligned, because the stlo/sthi pairs
  44    could then be merged back into single stores.  If there are a lot of cache
  45    misses, this is probably offset by the stall lengths on the preloads.
  46
  47 */
  48
  49 /* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
  50  * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
  51  * instruction counts used in the jump address calculation.
  52  * */
  53
  54         .section .text..SHmedia32,"ax"
  55         .little
  56         .balign 32
  57         .global copy_user_memcpy
  58         .global copy_user_memcpy_end
  59 copy_user_memcpy: /* Entry: r2 = destination, r3 = source, r4 = byte count; the return address is taken from r18. */
  60 /* Copies r4 bytes from r3 to r2. No instruction in this routine writes r2, so it still holds the destination on return. */
  61 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  62 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  63 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  64 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  65 /* The macros above pair a lo access at P+O with a hi access at the last byte of the item (O+3 for .l, O+7 for .q). STUAQ and STUAL are not used in the code visible here. */
  66         nop ! ld.b r3,0,r63 ! TAKum03020
  67         pta/l Large,tr0 /* tr0 = Large */
  68         movi 25,r0
  69         bgeu/u r4,r0,tr0 /* byte counts of 25 or more (unsigned) are handled at Large */
  70         nsb r4,r0 /* size-class key for counts 0..24; the slot order after L1 (0, 1, 2-3, 4-7, 8-15, 16-24) implies 63 for a zero count and one less per doubling - confirm against the ISA manual */
  71         shlli r0,5,r0 /* scale by 32: the slots after L1 each hold 8 instructions */
  72         movi (L1-L0+63*32 + 1) & 0xffff,r1 /* distance from L0 to L1 plus 63 slots; the +1 is presumably the SHmedia mode bit of the target - confirm */
  73         sub r1, r0, r0 /* r0 = (L1 - L0) + (63 - key) * 32 + 1 */
  74 L0:     ptrel r0,tr0 /* tr0 = selected slot, computed relative to L0; adding or removing instructions between L0 and the slots changes L1-L0, which the assembler recomputes, but the slot sizes must stay fixed */
  75         add r2,r4,r5 /* r5 = destination end (r2 + r4) */
  76         ptabs r18,tr1 /* tr1 = return address held in r18 */
  77         add r3,r4,r6 /* r6 = source end (r3 + r4) */
  78         blink tr0,r63 /* branch to the selected size-class handler */
  79
  80 /* Rearranged to make cut2 safe */
  81         .balign 8
  82 L4_7:   /* 4..7 byte memcpy cntd. (reached from the 4..7 byte slot with r0 = first 4 bytes, r7/r6 = lo/hi parts of the last 4 bytes) */
  83         stlo.l r2, 0, r0 /* lo part of the first 4 bytes at r2; the hi part was stored before the jump here */
  84         or r6, r7, r6 /* merge the two parts of the last 4 source bytes */
  85         sthi.l r5, -1, r6 /* store them so that they end at r5 - 1, the last destination byte */
  86         stlo.l r5, -4, r6
  87         blink tr1,r63 /* return through tr1 (loaded from r18 by the entry code) */
  88 /* Dispatch table: starting at L1 there is one 8-instruction slot per size class (0, 1, 2-3, 4-7, 8-15, 16-24 bytes). The entry code computes the slot address, so no instruction may be added to or removed from a slot. */
  89         .balign 8
  90 L1:     /* 0 byte memcpy (first slot of the table) */
  91         nop
  92         blink tr1,r63 /* nothing to copy: return */
  93         nop /* four nops of padding: together with L2_3 below they complete this 8-instruction slot */
  94         nop
  95         nop
  96         nop
  97
  98 L2_3:   /* 2 or 3 byte memcpy cntd. (placed in the slack of the 0 byte slot) */
  99         st.b r5,-1,r6 /* store the last byte, which the 2 or 3 byte slot loaded into r6 */
 100         blink tr1,r63 /* return */
 101
 102         /* 1 byte memcpy (next slot; L8_15 below fills the rest of it) */
 103         ld.b r3,0,r0
 104         st.b r2,0,r0
 105         blink tr1,r63 /* return */
 106
 107 L8_15:  /* 8..15 byte memcpy cntd. (reached from the 8..15 byte slot with r0 = first 8 bytes, r7/r6 = lo/hi parts of the last 8 bytes) */
 108         stlo.q r2, 0, r0 /* lo part of the first 8 bytes at r2; the hi part was stored before the jump here */
 109         or r6, r7, r6 /* merge the two parts of the last 8 source bytes */
 110         sthi.q r5, -1, r6 /* store them so that they end at r5 - 1 */
 111         stlo.q r5, -8, r6
 112         blink tr1,r63 /* return */
 113
 114         /* 2 or 3 byte memcpy */
 115         ld.b r3,0,r0 /* source byte 0 */
 116         nop ! ld.b r2,0,r63 ! TAKum03020
 117         ld.b r3,1,r1 /* source byte 1 */
 118         st.b r2,0,r0
 119         pta/l L2_3,tr0
 120         ld.b r6,-1,r6 /* last source byte (r6 was source end); this is byte 1 again when the count is 2 */
 121         st.b r2,1,r1
 122         blink tr0, r63 /* continue at L2_3, which stores the last byte */
 123
 124         /* 4 .. 7 byte memcpy */
 125         LDUAL (r3, 0, r0, r1) /* r0/r1 = lo/hi parts of source bytes 0..3 */
 126         pta L4_7, tr0
 127         ldlo.l r6, -4, r7 /* r7 = lo part of the last 4 source bytes (these overlap bytes 0..3 when the count is below 8) */
 128         or r0, r1, r0
 129         sthi.l r2, 3, r0 /* hi part of bytes 0..3; L4_7 stores the lo part */
 130         ldhi.l r6, -1, r6 /* r6 = hi part of the last 4 source bytes */
 131         blink tr0, r63 /* continue at L4_7 */
 132
 133         /* 8 .. 15 byte memcpy */
 134         LDUAQ (r3, 0, r0, r1) /* r0/r1 = lo/hi parts of source bytes 0..7 */
 135         pta L8_15, tr0
 136         ldlo.q r6, -8, r7 /* r7 = lo part of the last 8 source bytes (these overlap bytes 0..7 when the count is below 16) */
 137         or r0, r1, r0
 138         sthi.q r2, 7, r0 /* hi part of bytes 0..7; L8_15 stores the lo part */
 139         ldhi.q r6, -1, r6 /* r6 = hi part of the last 8 source bytes */
 140         blink tr0, r63 /* continue at L8_15 */
 141
 142         /* 16 .. 24 byte memcpy (last slot; NOTE(review): writes r8 and r9, outside the r0-r7 range the file header lists as trashed) */
 143         LDUAQ (r3, 0, r0, r1) /* source bytes 0..7 */
 144         LDUAQ (r3, 8, r8, r9) /* source bytes 8..15 */
 145         or r0, r1, r0
 146         sthi.q r2, 7, r0
 147         or r8, r9, r8
 148         sthi.q r2, 15, r8
 149         ldlo.q r6, -8, r7 /* r7/r6 = lo/hi parts of the last 8 source bytes (may overlap bytes 8..15) */
 150         ldhi.q r6, -1, r6
 151         stlo.q r2, 8, r8
 152         stlo.q r2, 0, r0
 153         or r6, r7, r6
 154         sthi.q r5, -1, r6 /* last 8 bytes end at r5 - 1, the last destination byte */
 155         stlo.q r5, -8, r6
 156         blink tr1,r63 /* return */
 157
 158 Large: /* 25 bytes or more. NOTE(review): this path writes r19-r25, r27 and r36 in addition to r0-r7, although the file header only lists r0-r7 as trashed. */
 159         ! ld.b r2, 0, r63 ! TAKum03020
 160         pta/l  Loop_ua, tr1
 161         ori r3, -8, r7 /* r7 = (r3 & 7) - 8, i.e. -8..-1, assuming the immediate is sign-extended - confirm */
 162         sub r2, r7, r22 /* r22 = r2 + 8 - (r3 & 7): destination cursor that corresponds to the next 8-byte source boundary */
 163         sub r3, r2, r6 /* r6 = source - destination, so r22 + r6 is the source address matching r22 */
 164         add r2, r4, r5
 165         ldlo.q r3, 0, r0 /* lo part of the first source quadword (the bytes from r3 up to that boundary) */
 166         addi r5, -16, r5 /* r5 = destination end - 16 */
 167         movi 64+8, r27 ! could subtract r7 from that.
 168         stlo.q r2, 0, r0 /* write the leading bytes at r2..r2+7; bytes past the boundary are rewritten by the later stores at r22 */
 169         sthi.q r2, 7, r0
 170         ldx.q r22, r6, r0 /* r0 = source quadword matching r22; it stays pending until the next store */
 171         bgtu/l r27, r4, tr1 /* fewer than 72 bytes: skip Loop_line and go to Loop_ua */
 172 /* Set-up for Loop_line, which moves 32 bytes per iteration. */
 173         addi r5, -48, r27 /* r27 = destination end - 64: Loop_line limit */
 174         pta/l Loop_line, tr0
 175         addi r6, 64, r36 /* NOTE(review): r36 is only referenced by the commented-out prefetch in Loop_line */
 176         addi r6, -24, r19 /* r19/r20/r21 = source offsets for the quadwords stored at r22-24, r22-16 and r22-8 */
 177         addi r6, -16, r20
 178         addi r6, -8, r21
 179
 180 Loop_line: /* on entry r0 holds the quadword still to be stored at r22 */
 181         ! ldx.q r22, r36, r63 ! TAKum03020
 182         alloco r22, 32 /* presumably allocates the cache block at r22 + 32 without fetching it - confirm against the ISA manual */
 183         synco /* guards the alloco, as described in the TAKum03020 note near the top of the file */
 184         addi r22, 32, r22
 185         ldx.q r22, r19, r23
 186         sthi.q r22, -25, r0 /* pending quadword goes to r22-32 .. r22-25 */
 187         ldx.q r22, r20, r24
 188         ldx.q r22, r21, r25
 189         stlo.q r22, -32, r0
 190         ldx.q r22, r6,  r0 /* new pending quadword for the next iteration (or for Loop_ua) */
 191         sthi.q r22, -17, r23
 192         sthi.q r22,  -9, r24
 193         sthi.q r22,  -1, r25
 194         stlo.q r22, -24, r23
 195         stlo.q r22, -16, r24
 196         stlo.q r22,  -8, r25
 197         bgeu r27, r22, tr0 /* repeat while r22 <= destination end - 64 */
 198
 199 Loop_ua: /* moves 8 bytes per iteration; tr1 points here until the tail reloads it */
 200         addi r22, 8, r22
 201         sthi.q r22, -1, r0 /* pending quadword goes to r22-8 .. r22-1 */
 202         stlo.q r22, -8, r0
 203         ldx.q r22, r6, r0 /* new pending quadword */
 204         bgtu/l r5, r22, tr1 /* repeat while r22 < destination end - 16 */
 205 /* Tail: store the pending quadword, then the last 8 bytes of the copy (the two stores may overlap). */
 206         add r3, r4, r7 /* r7 = source end */
 207         ldlo.q r7, -8, r1 /* r1/r7 = lo/hi parts of the last 8 source bytes */
 208         sthi.q r22, 7, r0 /* pending quadword goes to r22 .. r22+7 */
 209         ldhi.q r7, -1, r7
 210         ptabs r18,tr1 /* tr1 = return address held in r18 */
 211         stlo.q r22, 0, r0
 212         or r1, r7, r1
 213         sthi.q r5, 15, r1 /* last 8 bytes end at r5 + 15 = destination end - 1 */
 214         stlo.q r5, 8, r1
 215         blink tr1, r63 /* return */
 216 copy_user_memcpy_end: /* global label marking the end of the routine; presumably used by the user-copy fault fix-up code - confirm */
 217         nop