arch/mips/lib/memcpy.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  *
  13  * Mnemonic names for arguments to memcpy/__copy_user
  14  */
  15 #include <linux/config.h>
  16
  17 /*
  18  * Hack to resolve longstanding prefetch issue
  19  *
  20  * Prefetching beyond the end of memory may be fatal on some systems.  It's
  21  * also a seriously bad idea on non dma-coherent systems.  NOTE(review): the
  22  * first test below keeps prefetch only when BOTH options are defined; confirm.
  23  */
  24 #if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
  25 #undef CONFIG_CPU_HAS_PREFETCH  /* presumably consulted by PREF() in the headers below - confirm */
  26 #endif
  27 #ifdef CONFIG_MIPS_MALTA
  28 #undef CONFIG_CPU_HAS_PREFETCH  /* unconditionally off for this board */
  29 #endif
  30
  31 #include <asm/asm.h>
  32 #include <asm/asm-offsets.h>
  33 #include <asm/regdef.h>
  34
  35 #define dst a0  /* destination pointer (first argument) */
  36 #define src a1  /* source pointer (second argument) */
  37 #define len a2  /* byte count; __copy_user reports the uncopied count here */
  38
  39 /*
  40  * Spec
  41  *
  42  * memcpy copies len bytes from src to dst and sets v0 to dst.
  43  * It assumes that
  44  *   - src and dst don't overlap
  45  *   - src is readable
  46  *   - dst is writable
  47  * memcpy uses the standard calling convention
  48  *
  49  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  50  * the number of uncopied bytes due to an exception caused by a read or write.
  51  * __copy_user assumes that src and dst don't overlap, and that the call is
  52  * implementing one of the following:
  53  *   copy_to_user
  54  *     - src is readable  (no exceptions when reading src)
  55  *   copy_from_user
  56  *     - dst is writable  (no exceptions when writing dst)
  57  * __copy_user uses a non-standard calling convention; see
  58  * include/asm-mips/uaccess.h
  59  *
  60  * When an exception happens on a load, the handler must
  61  * ensure that all of the destination buffer is overwritten to prevent
  62  * leaking information to user mode programs.
  63  */
  64
  65 /*
  66  * Implementation
  67  */
  68
  69 /*
  70  * The exception handler for loads requires that:
  71  *  1- AT contain the address of the byte just past the end of the source
  72  *     of the copy,
  73  *  2- src_entry <= src < AT, and
  74  *  3- (dst - src) == (dst_entry - src_entry),
  75  * The _entry suffix denotes values when __copy_user was called.
  76  *
  77  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  78  * (2) is met by incrementing src by the number of bytes copied
  79  * (3) is met by not doing loads between a pair of increments of dst and src
  80  *
  81  * The exception handlers for stores adjust len (if necessary) and return.
  82  * These handlers do not need to overwrite any data.
  83  *
  84  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  85  * they're not protected.
  86  */
  87 /* EXC: emit "inst_reg, addr" at local label 9 and record the pair (9b, handler) in __ex_table. */
  88 #define EXC(inst_reg,addr,handler)              \
  89 9:      inst_reg, addr;                         \
  90         .section __ex_table,"a";                \
  91         PTR     9b, handler;                    \
  92         .previous
  93
  94 /*
  95  * Only on the 64-bit kernel can we make use of 64-bit registers.
  96  */
  97 #ifdef CONFIG_64BIT
  98 #define USE_DOUBLE
  99 #endif
 100
 101 #ifdef USE_DOUBLE
 102
 103 #define LOAD   ld
 104 #define LOADL  ldl
 105 #define LOADR  ldr
 106 #define STOREL sdl
 107 #define STORER sdr
 108 #define STORE  sd
 109 #define ADD    daddu
 110 #define SUB    dsubu
 111 #define SRL    dsrl
 112 #define SRA    dsra
 113 #define SLL    dsll
 114 #define SLLV   dsllv
 115 #define SRLV   dsrlv
 116 #define NBYTES 8        /* bytes per copy unit (one register) */
 117 #define LOG_NBYTES 3    /* log2(NBYTES) */
 118
 119 /*
 120  * As we are sharing the code base with the mips32 tree (which uses the o32
 121  * ABI register definitions), we need to redefine the register definitions
 122  * from the n64 ABI register naming to the o32 ABI register naming.
 123  */
 124 #undef t0
 125 #undef t1
 126 #undef t2
 127 #undef t3
 128 #define t0      $8
 129 #define t1      $9
 130 #define t2      $10
 131 #define t3      $11
 132 #define t4      $12
 133 #define t5      $13
 134 #define t6      $14
 135 #define t7      $15
 136
 137 #else /* !USE_DOUBLE: 32-bit mnemonics */
 138
 139 #define LOAD   lw
 140 #define LOADL  lwl
 141 #define LOADR  lwr
 142 #define STOREL swl
 143 #define STORER swr
 144 #define STORE  sw
 145 #define ADD    addu
 146 #define SUB    subu
 147 #define SRL    srl
 148 #define SLL    sll
 149 #define SRA    sra
 150 #define SLLV   sllv
 151 #define SRLV   srlv
 152 #define NBYTES 4        /* bytes per copy unit (one register) */
 153 #define LOG_NBYTES 2    /* log2(NBYTES) */
 154
 155 #endif /* USE_DOUBLE */
 156
 157 #ifdef CONFIG_CPU_LITTLE_ENDIAN  /* pick the left/right partial ops by byte order */
 158 #define LDFIRST LOADR   /* partial load addressed at the first byte of a unit */
 159 #define LDREST  LOADL   /* partial load addressed at the last byte of a unit */
 160 #define STFIRST STORER  /* partial store addressed at the first byte of a unit */
 161 #define STREST  STOREL  /* partial store addressed at the last byte of a unit */
 162 #define SHIFT_DISCARD SLLV  /* shift that drops the unwanted bytes of a partial unit */
 163 #else /* big endian: the same roles with left and right swapped */
 164 #define LDFIRST LOADL
 165 #define LDREST  LOADR
 166 #define STFIRST STOREL
 167 #define STREST  STORER
 168 #define SHIFT_DISCARD SRLV
 169 #endif /* CONFIG_CPU_LITTLE_ENDIAN */
 170
 171 #define FIRST(unit) ((unit)*NBYTES)  /* byte offset of the first byte of unit N */
 172 #define REST(unit)  (FIRST(unit)+NBYTES-1)  /* byte offset of the last byte of unit N */
 173 #define UNIT(unit)  FIRST(unit)  /* byte offset of unit N, used for the aligned accesses */
 174
 175 #define ADDRMASK (NBYTES-1)  /* address bits that are zero when a pointer is unit aligned */
 176
 177         .text
 178         .set    noreorder       /* branch delay slots below are filled by hand */
 179         .set    noat            /* AT must stay untouched: the load fault handler reads it */
 180
 181 /*
 182  * A combined memcpy/__copy_user: both entry points share the body below.
 183  * __copy_user sets len to 0 for success; else to an upper bound of
 184  * the number of uncopied bytes.
 185  * memcpy sets v0 to dst.
 186  */
 187         .align  5
 188 LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
 189         move    v0, dst                         /* return value */
 190 __memcpy:                                       /* memmove branches here for non-overlapping regions */
 191 FEXPORT(__copy_user)                            /* entering here skips the write to v0 above */
 192         /*
 193          * Note: dst & src may be unaligned, len may be 0
 194          * Temps: rem (t8) = tail byte count that the current loop leaves uncopied
 195          */
 196 #define rem t8
 197
 198         /*
 199          * The "issue break"s below are very approximate.
 200          * Issue delays for dcache fills will perturb the schedule, as will
 201          * load queue full replay traps, etc.
 202          *
 203          * If len < NBYTES use byte operations.
 204          */
 205         PREF(   0, 0(src) )
 206         PREF(   1, 0(dst) )
 207         sltu    t2, len, NBYTES                 /* t2 = (len < NBYTES) */
 208         and     t1, dst, ADDRMASK               /* t1 = misalignment of dst */
 209         PREF(   0, 1*32(src) )
 210         PREF(   1, 1*32(dst) )
 211         bnez    t2, copy_bytes_checklen         /* short copy: byte-at-a-time path */
 212          and    t0, src, ADDRMASK               /* (delay slot) t0 = misalignment of src */
 213         PREF(   0, 2*32(src) )
 214         PREF(   1, 2*32(dst) )
 215         bnez    t1, dst_unaligned
 216          nop
 217         bnez    t0, src_unaligned_dst_aligned
 218         /*
 219          * use delay slot for fall-through: the SRL below runs either way
 220          * src and dst are aligned; need to compute rem
 221          */
 222 both_aligned:
 223          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 224         beqz    t0, cleanup_both_aligned # len < 8*NBYTES
 225          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
 226         PREF(   0, 3*32(src) )
 227         PREF(   1, 3*32(dst) )
 228         .align  4
 229 1:                                              /* main loop: 8 units per iteration */
 230 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 231 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 232 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 233 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 234         SUB     len, len, 8*NBYTES              /* taken off early; s_exc_pNu adds back N units on a store fault */
 235 EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
 236 EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
 237 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
 238 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
 239 EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
 240 EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
 241         ADD     src, src, 8*NBYTES              /* src and dst advance back to back, with no load in between */
 242         ADD     dst, dst, 8*NBYTES              /* the remaining stores use negative unit offsets */
 243 EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
 244 EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
 245 EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
 246 EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
 247 EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
 248 EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
 249         PREF(   0, 8*32(src) )
 250         PREF(   1, 8*32(dst) )
 251         bne     len, rem, 1b                    /* repeat until only the tail is left */
 252          nop
 253
 254         /*
 255          * len == rem == the number of bytes left to copy < 8*NBYTES
 256          */
 257 cleanup_both_aligned:
 258         beqz    len, done
 259          sltu   t0, len, 4*NBYTES               /* (delay slot) t0 = (len < 4*NBYTES) */
 260         bnez    t0, less_than_4units
 261          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 262         /*
 263          * len >= 4*NBYTES
 264          */
 265 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 266 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 267 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 268 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 269         SUB     len, len, 4*NBYTES
 270         ADD     src, src, 4*NBYTES
 271 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
 272 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 273 EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 274 EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 275         beqz    len, done
 276          ADD    dst, dst, 4*NBYTES              /* (delay slot) executed on both paths */
 277 less_than_4units:
 278         /*
 279          * rem = len % NBYTES
 280          */
 281         beq     rem, len, copy_bytes            /* less than one whole unit left */
 282          nop
 283 1:                                              /* one unit per iteration */
 284 EXC(    LOAD    t0, 0(src),             l_exc)
 285         ADD     src, src, NBYTES
 286         SUB     len, len, NBYTES
 287 EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 288         bne     rem, len, 1b
 289          ADD    dst, dst, NBYTES
 290
 291         /*
 292          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 293          * A loop would do only a byte at a time with possible branch
 294          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 295          * because can't assume read-access to dst.  Instead, use
 296          * STREST dst, which doesn't require read access to dst.
 297          *
 298          * This code should perform better than a simple loop on modern,
 299          * wide-issue mips processors because the code has fewer branches and
 300          * more instruction-level parallelism.
 301          */
 302 #define bits t2
 303         beqz    len, done
 304          ADD    t1, dst, len    # t1 is just past last byte of dst
 305         li      bits, 8*NBYTES
 306         SLL     rem, len, 3     # rem = number of bits to keep
 307 EXC(    LOAD    t0, 0(src),             l_exc)
 308         SUB     bits, bits, rem # bits = number of bits to discard
 309         SHIFT_DISCARD t0, t0, bits
 310 EXC(    STREST  t0, -1(t1),             s_exc)  /* partial store ending at the last byte of dst */
 311         jr      ra
 312          move   len, zero                       /* (delay slot) nothing left uncopied */
 313 dst_unaligned:
 314         /*
 315          * dst is unaligned
 316          * t0 = src & ADDRMASK
 317          * t1 = dst & ADDRMASK; t1 > 0
 318          * len >= NBYTES
 319          *
 320          * Copy enough bytes to align dst
 321          * Set match = (src and dst have same alignment)
 322          */
 323 #define match rem
 324 EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
 325         ADD     t2, zero, NBYTES
 326 EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
 327         SUB     t2, t2, t1      # t2 = number of bytes copied
 328         xor     match, t0, t1                   /* zero when src and dst share the same misalignment */
 329 EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
 330         beq     len, t2, done
 331          SUB    len, len, t2                    /* (delay slot) len is 0 when the branch is taken */
 332         ADD     dst, dst, t2
 333         beqz    match, both_aligned
 334          ADD    src, src, t2                    /* (delay slot) executed on both paths */
 335
 336 src_unaligned_dst_aligned:
 337         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 338         PREF(   0, 3*32(src) )
 339         beqz    t0, cleanup_src_unaligned
 340          and    rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
 341         PREF(   1, 3*32(dst) )
 342 1:
 343 /*
 344  * Avoid consecutive LD*'s to the same register since some mips
 345  * implementations can't issue them in the same cycle.
 346  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 347  * are to the same unit (unless src is aligned, but it's not).
 348  */
 349 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 350 EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
 351         SUB     len, len, 4*NBYTES
 352 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 353 EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
 354 EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
 355 EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
 356 EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
 357 EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
 358         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 359         ADD     src, src, 4*NBYTES
 360 #ifdef CONFIG_CPU_SB1
 361         nop                             # improves slotting
 362 #endif
 363 EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
 364 EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
 365 EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
 366 EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
 367         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 368         bne     len, rem, 1b
 369          ADD    dst, dst, 4*NBYTES
 370
 371 cleanup_src_unaligned:
 372         beqz    len, done
 373          and    rem, len, NBYTES-1  # rem = len % NBYTES
 374         beq     rem, len, copy_bytes            /* less than one whole unit left */
 375          nop
 376 1:                                              /* one unit per iteration, unaligned source */
 377 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 378 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 379         ADD     src, src, NBYTES
 380         SUB     len, len, NBYTES
 381 EXC(    STORE   t0, 0(dst),             s_exc_p1u)
 382         bne     len, rem, 1b
 383          ADD    dst, dst, NBYTES
 384
 385 copy_bytes_checklen:
 386         beqz    len, done
 387          nop
 388 copy_bytes:
 389         /* 0 < len < NBYTES; each sb below sits in a branch delay slot, so it runs even when the branch is taken */
 390 #define COPY_BYTE(N)                    \
 391 EXC(    lb      t0, N(src), l_exc);     \
 392         SUB     len, len, 1;            \
 393         beqz    len, done;              \
 394 EXC(     sb     t0, N(dst), s_exc_p1)
 395
 396         COPY_BYTE(0)
 397         COPY_BYTE(1)
 398 #ifdef USE_DOUBLE
 399         COPY_BYTE(2)
 400         COPY_BYTE(3)
 401         COPY_BYTE(4)
 402         COPY_BYTE(5)
 403 #endif
 404 EXC(    lb      t0, NBYTES-2(src), l_exc)       /* last possible byte: offset NBYTES-2 */
 405         SUB     len, len, 1
 406         jr      ra
 407 EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)    /* store executes in the delay slot of the return */
 408 done:
 409         jr      ra
 410          nop
 411         END(memcpy)
 412
 413 l_exc_copy:
 414         /*
 415          * Copy bytes from src until faulting load address (or until a
 416          * lb faults)
 417          *
 418          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 419          * may be more than a byte beyond the last address.
 420          * Hence, the lb below may get an exception.
 421          *
 422          * Assumes src < THREAD_BUADDR($28)
 423          */
 424         LOAD    t0, TI_TASK($28)                /* t0 = TI_TASK field of the structure at $28 */
 425          nop
 426         LOAD    t0, THREAD_BUADDR(t0)           /* t0 = THREAD_BUADDR field, used as the stop address */
 427 1:
 428 EXC(    lb      t1, 0(src),     l_exc)
 429         ADD     src, src, 1
 430         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 431         bne     src, t0, 1b
 432          ADD    dst, dst, 1
 433 l_exc:
 434         LOAD    t0, TI_TASK($28)
 435          nop
 436         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 437          nop
 438         SUB     len, AT, t0             # len number of uncopied bytes
 439         /*
 440          * Here's where we rely on src and dst being incremented in tandem,
 441          *   See (3) above.
 442          * dst += (fault addr - src) to put dst at first byte to clear
 443          */
 444         ADD     dst, t0                 # compute start address in a0
 445         SUB     dst, src
 446         /*
 447          * Clear len bytes starting at dst.  Can't call __bzero because it
 448          * might modify len.  An inefficient loop for these rare times...
 449          */
 450         beqz    len, done
 451          SUB    src, len, 1                     /* (delay slot) src is reused as the loop counter */
 452 1:      sb      zero, 0(dst)
 453         ADD     dst, dst, 1
 454         bnez    src, 1b
 455          SUB    src, src, 1
 456         jr      ra                              /* len still holds the uncopied byte count */
 457          nop
 458
 459 /* SEXC(n): store fault handler s_exc_p<n>u; returns with n*NBYTES added back to len. */
 460 #define SEXC(n)                         \
 461 s_exc_p ## n ## u:                      \
 462         jr      ra;                     \
 463          ADD    len, len, n*NBYTES
 464
 465 SEXC(8)
 466 SEXC(7)
 467 SEXC(6)
 468 SEXC(5)
 469 SEXC(4)
 470 SEXC(3)
 471 SEXC(2)
 472 SEXC(1)
 473
 474 s_exc_p1:                               /* byte store faulted: one more byte is uncopied */
 475         jr      ra
 476          ADD    len, len, 1
 477 s_exc:                                  /* store faulted, len returned unchanged */
 478         jr      ra
 479          nop
 480
 481         .align  5
 482 LEAF(memmove)                                   /* a0=dst a1=src a2=len */
 483         ADD     t0, a0, a2                      /* t0 = dst + len */
 484         ADD     t1, a1, a2                      /* t1 = src + len */
 485         sltu    t0, a1, t0                      # dst + len <= src -> memcpy
 486         sltu    t1, a0, t1                      # dst >= src + len -> memcpy
 487         and     t0, t1                          /* nonzero only when both overlap tests hold */
 488         beqz    t0, __memcpy                    /* no overlap: use the shared forward copy */
 489          move   v0, a0                          /* return value */
 490         beqz    a2, r_out                       /* delay slot is the sltu that opens __rmemcpy */
 491         END(memmove)
 492
 493         /* fall through to __rmemcpy; NOTE(review): len == 0 is only filtered in memmove, both loops decrement before testing */
 494 LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
 495          sltu   t0, a1, a0                      /* t0 = (src < dst) */
 496         beqz    t0, r_end_bytes_up              # src >= dst
 497          nop
 498         ADD     a0, a2                          # dst = dst + len
 499         ADD     a1, a2                          # src = src + len
 500         /* src < dst: copy one byte at a time from the end towards the start */
 501 r_end_bytes:
 502         lb      t0, -1(a1)
 503         SUB     a2, a2, 0x1
 504         sb      t0, -1(a0)
 505         SUB     a1, a1, 0x1
 506         bnez    a2, r_end_bytes
 507          SUB    a0, a0, 0x1
 508
 509 r_out:
 510         jr      ra
 511          move   a2, zero                        /* (delay slot) len = 0 on return */
 512         /* src >= dst: copy one byte at a time from the start upwards */
 513 r_end_bytes_up:
 514         lb      t0, (a1)
 515         SUB     a2, a2, 0x1
 516         sb      t0, (a0)
 517         ADD     a1, a1, 0x1
 518         bnez    a2, r_end_bytes_up
 519          ADD    a0, a0, 0x1
 520
 521         jr      ra
 522          move   a2, zero                        /* (delay slot) len = 0 on return */
 523         END(__rmemcpy)