arch/mips/lib/csum_partial.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Quick'n'dirty IP checksum ...
   7  *
   8  * Copyright (C) 1998, 1999 Ralf Baechle
   9  * Copyright (C) 1999 Silicon Graphics, Inc.
  10  * Copyright (C) 2007  Maciej W. Rozycki
  11  */
  12 #include <linux/errno.h>
  13 #include <asm/asm.h>
  14 #include <asm/asm-offsets.h>
  15 #include <asm/regdef.h>
  16
  17 #ifdef CONFIG_64BIT
  18 /*
  19  * As we are sharing code base with the mips32 tree (which use the o32 ABI
  20  * register definitions). We need to redefine the register definitions from
  21  * the n64 ABI register naming to the o32 ABI register naming.
  22  */
  23 #undef t0
  24 #undef t1
  25 #undef t2
  26 #undef t3
  27 #define t0      $8
  28 #define t1      $9
  29 #define t2      $10
  30 #define t3      $11
  31 #define t4      $12
  32 #define t5      $13
  33 #define t6      $14
  34 #define t7      $15
  35
  36 #define USE_DOUBLE
  37 #endif
  38
  39 #ifdef USE_DOUBLE
  40
  41 #define LOAD   ld
  42 #define LOAD32 lwu
  43 #define ADD    daddu
  44 #define NBYTES 8
  45
  46 #else
  47
  48 #define LOAD   lw
  49 #define LOAD32 lw
  50 #define ADD    addu
  51 #define NBYTES 4
  52
  53 #endif /* USE_DOUBLE */
  54
  55 #define UNIT(unit)  ((unit)*NBYTES)
  56
  57 #define ADDC(sum,reg)                                           \
  58         .set    push;                                           \
  59         .set    noat;                                           \
  60         ADD     sum, reg;                                       \
  61         sltu    v1, sum, reg;                                   \
  62         ADD     sum, v1;                                        \
  63         .set    pop
  64
  65 #define ADDC32(sum,reg)                                         \
  66         .set    push;                                           \
  67         .set    noat;                                           \
  68         addu    sum, reg;                                       \
  69         sltu    v1, sum, reg;                                   \
  70         addu    sum, v1;                                        \
  71         .set    pop
  72
  73 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
  74         LOAD    _t0, (offset + UNIT(0))(src);                   \
  75         LOAD    _t1, (offset + UNIT(1))(src);                   \
  76         LOAD    _t2, (offset + UNIT(2))(src);                   \
  77         LOAD    _t3, (offset + UNIT(3))(src);                   \
  78         ADDC(sum, _t0);                                         \
  79         ADDC(sum, _t1);                                         \
  80         ADDC(sum, _t2);                                         \
  81         ADDC(sum, _t3)
  82
  83 #ifdef USE_DOUBLE
  84 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
  85         CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
  86 #else
  87 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
  88         CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
  89         CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
  90 #endif
  91
  92 /*
  93  * a0: source address
  94  * a1: length of the area to checksum
  95  * a2: partial checksum
  96  */
  97
  98 #define src a0
  99 #define sum v0
 100
 101         .text
 102         .set    noreorder
 103         .align  5
 104 LEAF(csum_partial)
 105         move    sum, zero
 106         move    t7, zero
 107
 108         sltiu   t8, a1, 0x8
 109         bnez    t8, .Lsmall_csumcpy             /* < 8 bytes to copy */
 110          move   t2, a1
 111
 112         andi    t7, src, 0x1                    /* odd buffer? */
 113
 114 .Lhword_align:
 115         beqz    t7, .Lword_align
 116          andi   t8, src, 0x2
 117
 118         lbu     t0, (src)
 119         LONG_SUBU       a1, a1, 0x1
 120 #ifdef __MIPSEL__
 121         sll     t0, t0, 8
 122 #endif
 123         ADDC(sum, t0)
 124         PTR_ADDU        src, src, 0x1
 125         andi    t8, src, 0x2
 126
 127 .Lword_align:
 128         beqz    t8, .Ldword_align
 129          sltiu  t8, a1, 56
 130
 131         lhu     t0, (src)
 132         LONG_SUBU       a1, a1, 0x2
 133         ADDC(sum, t0)
 134         sltiu   t8, a1, 56
 135         PTR_ADDU        src, src, 0x2
 136
 137 .Ldword_align:
 138         bnez    t8, .Ldo_end_words
 139          move   t8, a1
 140
 141         andi    t8, src, 0x4
 142         beqz    t8, .Lqword_align
 143          andi   t8, src, 0x8
 144
 145         LOAD32  t0, 0x00(src)
 146         LONG_SUBU       a1, a1, 0x4
 147         ADDC(sum, t0)
 148         PTR_ADDU        src, src, 0x4
 149         andi    t8, src, 0x8
 150
 151 .Lqword_align:
 152         beqz    t8, .Loword_align
 153          andi   t8, src, 0x10
 154
 155 #ifdef USE_DOUBLE
 156         ld      t0, 0x00(src)
 157         LONG_SUBU       a1, a1, 0x8
 158         ADDC(sum, t0)
 159 #else
 160         lw      t0, 0x00(src)
 161         lw      t1, 0x04(src)
 162         LONG_SUBU       a1, a1, 0x8
 163         ADDC(sum, t0)
 164         ADDC(sum, t1)
 165 #endif
 166         PTR_ADDU        src, src, 0x8
 167         andi    t8, src, 0x10
 168
 169 .Loword_align:
 170         beqz    t8, .Lbegin_movement
 171          LONG_SRL       t8, a1, 0x7
 172
 173 #ifdef USE_DOUBLE
 174         ld      t0, 0x00(src)
 175         ld      t1, 0x08(src)
 176         ADDC(sum, t0)
 177         ADDC(sum, t1)
 178 #else
 179         CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
 180 #endif
 181         LONG_SUBU       a1, a1, 0x10
 182         PTR_ADDU        src, src, 0x10
 183         LONG_SRL        t8, a1, 0x7
 184
 185 .Lbegin_movement:
 186         beqz    t8, 1f
 187          andi   t2, a1, 0x40
 188
 189 .Lmove_128bytes:
 190         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 191         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 192         CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
 193         CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
 194         LONG_SUBU       t8, t8, 0x01
 195         .set    reorder                         /* DADDI_WAR */
 196         PTR_ADDU        src, src, 0x80
 197         bnez    t8, .Lmove_128bytes
 198         .set    noreorder
 199
 200 1:
 201         beqz    t2, 1f
 202          andi   t2, a1, 0x20
 203
 204 .Lmove_64bytes:
 205         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 206         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 207         PTR_ADDU        src, src, 0x40
 208
 209 1:
 210         beqz    t2, .Ldo_end_words
 211          andi   t8, a1, 0x1c
 212
 213 .Lmove_32bytes:
 214         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 215         andi    t8, a1, 0x1c
 216         PTR_ADDU        src, src, 0x20
 217
 218 .Ldo_end_words:
 219         beqz    t8, .Lsmall_csumcpy
 220          andi   t2, a1, 0x3
 221         LONG_SRL        t8, t8, 0x2
 222
 223 .Lend_words:
 224         LOAD32  t0, (src)
 225         LONG_SUBU       t8, t8, 0x1
 226         ADDC(sum, t0)
 227         .set    reorder                         /* DADDI_WAR */
 228         PTR_ADDU        src, src, 0x4
 229         bnez    t8, .Lend_words
 230         .set    noreorder
 231
 232 /* unknown src alignment and < 8 bytes to go  */
 233 .Lsmall_csumcpy:
 234         move    a1, t2
 235
 236         andi    t0, a1, 4
 237         beqz    t0, 1f
 238          andi   t0, a1, 2
 239
 240         /* Still a full word to go  */
 241         ulw     t1, (src)
 242         PTR_ADDIU       src, 4
 243 #ifdef USE_DOUBLE
 244         dsll    t1, t1, 32                      /* clear lower 32bit */
 245 #endif
 246         ADDC(sum, t1)
 247
 248 1:      move    t1, zero
 249         beqz    t0, 1f
 250          andi   t0, a1, 1
 251
 252         /* Still a halfword to go  */
 253         ulhu    t1, (src)
 254         PTR_ADDIU       src, 2
 255
 256 1:      beqz    t0, 1f
 257          sll    t1, t1, 16
 258
 259         lbu     t2, (src)
 260          nop
 261
 262 #ifdef __MIPSEB__
 263         sll     t2, t2, 8
 264 #endif
 265         or      t1, t2
 266
 267 1:      ADDC(sum, t1)
 268
 269         /* fold checksum */
 270         .set    push
 271         .set    noat
 272 #ifdef USE_DOUBLE
 273         dsll32  v1, sum, 0
 274         daddu   sum, v1
 275         sltu    v1, sum, v1
 276         dsra32  sum, sum, 0
 277         addu    sum, v1
 278 #endif
 279
 280         /* odd buffer alignment? */
 281         beqz    t7, 1f
 282          nop
 283         sll     v1, sum, 8
 284         srl     sum, sum, 8
 285         or      sum, v1
 286         andi    sum, 0xffff
 287         .set    pop
 288 1:
 289         .set    reorder
 290         /* Add the passed partial csum.  */
 291         ADDC32(sum, a2)
 292         jr      ra
 293         .set    noreorder
 294         END(csum_partial)
 295
 296
 297 /*
 298  * checksum and copy routines based on memcpy.S
 299  *
 300  *      csum_partial_copy_nocheck(src, dst, len, sum)
 301  *      __csum_partial_copy_user(src, dst, len, sum, errp)
 302  *
 303  * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 304  * function in this file use the standard calling convention.
 305  */
 306
 307 #define src a0
 308 #define dst a1
 309 #define len a2
 310 #define psum a3
 311 #define sum v0
 312 #define odd t8
 313 #define errptr t9
 314
 315 /*
 316  * The exception handler for loads requires that:
 317  *  1- AT contain the address of the byte just past the end of the source
 318  *     of the copy,
 319  *  2- src_entry <= src < AT, and
 320  *  3- (dst - src) == (dst_entry - src_entry),
 321  * The _entry suffix denotes values when __copy_user was called.
 322  *
 323  * (1) is set up up by __csum_partial_copy_from_user and maintained by
 324  *      not writing AT in __csum_partial_copy
 325  * (2) is met by incrementing src by the number of bytes copied
 326  * (3) is met by not doing loads between a pair of increments of dst and src
 327  *
 328  * The exception handlers for stores stores -EFAULT to errptr and return.
 329  * These handlers do not need to overwrite any data.
 330  */
 331
 332 #define EXC(inst_reg,addr,handler)              \
 333 9:      inst_reg, addr;                         \
 334         .section __ex_table,"a";                \
 335         PTR     9b, handler;                    \
 336         .previous
 337
 338 #ifdef USE_DOUBLE
 339
 340 #define LOAD   ld
 341 #define LOADL  ldl
 342 #define LOADR  ldr
 343 #define STOREL sdl
 344 #define STORER sdr
 345 #define STORE  sd
 346 #define ADD    daddu
 347 #define SUB    dsubu
 348 #define SRL    dsrl
 349 #define SLL    dsll
 350 #define SLLV   dsllv
 351 #define SRLV   dsrlv
 352 #define NBYTES 8
 353 #define LOG_NBYTES 3
 354
 355 #else
 356
 357 #define LOAD   lw
 358 #define LOADL  lwl
 359 #define LOADR  lwr
 360 #define STOREL swl
 361 #define STORER swr
 362 #define STORE  sw
 363 #define ADD    addu
 364 #define SUB    subu
 365 #define SRL    srl
 366 #define SLL    sll
 367 #define SLLV   sllv
 368 #define SRLV   srlv
 369 #define NBYTES 4
 370 #define LOG_NBYTES 2
 371
 372 #endif /* USE_DOUBLE */
 373
 374 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 375 #define LDFIRST LOADR
 376 #define LDREST  LOADL
 377 #define STFIRST STORER
 378 #define STREST  STOREL
 379 #define SHIFT_DISCARD SLLV
 380 #define SHIFT_DISCARD_REVERT SRLV
 381 #else
 382 #define LDFIRST LOADL
 383 #define LDREST  LOADR
 384 #define STFIRST STOREL
 385 #define STREST  STORER
 386 #define SHIFT_DISCARD SRLV
 387 #define SHIFT_DISCARD_REVERT SLLV
 388 #endif
 389
 390 #define FIRST(unit) ((unit)*NBYTES)
 391 #define REST(unit)  (FIRST(unit)+NBYTES-1)
 392
 393 #define ADDRMASK (NBYTES-1)
 394
 395 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 396         .set    noat
 397 #else
 398         .set    at=v1
 399 #endif
 400
 401 LEAF(__csum_partial_copy_user)
 402         PTR_ADDU        AT, src, len    /* See (1) above. */
 403 #ifdef CONFIG_64BIT
 404         move    errptr, a4
 405 #else
 406         lw      errptr, 16(sp)
 407 #endif
 408 FEXPORT(csum_partial_copy_nocheck)
 409         move    sum, zero
 410         move    odd, zero
 411         /*
 412          * Note: dst & src may be unaligned, len may be 0
 413          * Temps
 414          */
 415         /*
 416          * The "issue break"s below are very approximate.
 417          * Issue delays for dcache fills will perturb the schedule, as will
 418          * load queue full replay traps, etc.
 419          *
 420          * If len < NBYTES use byte operations.
 421          */
 422         sltu    t2, len, NBYTES
 423         and     t1, dst, ADDRMASK
 424         bnez    t2, .Lcopy_bytes_checklen
 425          and    t0, src, ADDRMASK
 426         andi    odd, dst, 0x1                   /* odd buffer? */
 427         bnez    t1, .Ldst_unaligned
 428          nop
 429         bnez    t0, .Lsrc_unaligned_dst_aligned
 430         /*
 431          * use delay slot for fall-through
 432          * src and dst are aligned; need to compute rem
 433          */
 434 .Lboth_aligned:
 435          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 436         beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
 437          nop
 438         SUB     len, 8*NBYTES           # subtract here for bgez loop
 439         .align  4
 440 1:
 441 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 442 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 443 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 444 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 445 EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
 446 EXC(    LOAD    t5, UNIT(5)(src),       .Ll_exc_copy)
 447 EXC(    LOAD    t6, UNIT(6)(src),       .Ll_exc_copy)
 448 EXC(    LOAD    t7, UNIT(7)(src),       .Ll_exc_copy)
 449         SUB     len, len, 8*NBYTES
 450         ADD     src, src, 8*NBYTES
 451 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
 452         ADDC(sum, t0)
 453 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
 454         ADDC(sum, t1)
 455 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
 456         ADDC(sum, t2)
 457 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
 458         ADDC(sum, t3)
 459 EXC(    STORE   t4, UNIT(4)(dst),       .Ls_exc)
 460         ADDC(sum, t4)
 461 EXC(    STORE   t5, UNIT(5)(dst),       .Ls_exc)
 462         ADDC(sum, t5)
 463 EXC(    STORE   t6, UNIT(6)(dst),       .Ls_exc)
 464         ADDC(sum, t6)
 465 EXC(    STORE   t7, UNIT(7)(dst),       .Ls_exc)
 466         ADDC(sum, t7)
 467         .set    reorder                         /* DADDI_WAR */
 468         ADD     dst, dst, 8*NBYTES
 469         bgez    len, 1b
 470         .set    noreorder
 471         ADD     len, 8*NBYTES           # revert len (see above)
 472
 473         /*
 474          * len == the number of bytes left to copy < 8*NBYTES
 475          */
 476 .Lcleanup_both_aligned:
 477 #define rem t7
 478         beqz    len, .Ldone
 479          sltu   t0, len, 4*NBYTES
 480         bnez    t0, .Lless_than_4units
 481          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 482         /*
 483          * len >= 4*NBYTES
 484          */
 485 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 486 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 487 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 488 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 489         SUB     len, len, 4*NBYTES
 490         ADD     src, src, 4*NBYTES
 491 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
 492         ADDC(sum, t0)
 493 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
 494         ADDC(sum, t1)
 495 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
 496         ADDC(sum, t2)
 497 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
 498         ADDC(sum, t3)
 499         .set    reorder                         /* DADDI_WAR */
 500         ADD     dst, dst, 4*NBYTES
 501         beqz    len, .Ldone
 502         .set    noreorder
 503 .Lless_than_4units:
 504         /*
 505          * rem = len % NBYTES
 506          */
 507         beq     rem, len, .Lcopy_bytes
 508          nop
 509 1:
 510 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 511         ADD     src, src, NBYTES
 512         SUB     len, len, NBYTES
 513 EXC(    STORE   t0, 0(dst),             .Ls_exc)
 514         ADDC(sum, t0)
 515         .set    reorder                         /* DADDI_WAR */
 516         ADD     dst, dst, NBYTES
 517         bne     rem, len, 1b
 518         .set    noreorder
 519
 520         /*
 521          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 522          * A loop would do only a byte at a time with possible branch
 523          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 524          * because can't assume read-access to dst.  Instead, use
 525          * STREST dst, which doesn't require read access to dst.
 526          *
 527          * This code should perform better than a simple loop on modern,
 528          * wide-issue mips processors because the code has fewer branches and
 529          * more instruction-level parallelism.
 530          */
 531 #define bits t2
 532         beqz    len, .Ldone
 533          ADD    t1, dst, len    # t1 is just past last byte of dst
 534         li      bits, 8*NBYTES
 535         SLL     rem, len, 3     # rem = number of bits to keep
 536 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 537         SUB     bits, bits, rem # bits = number of bits to discard
 538         SHIFT_DISCARD t0, t0, bits
 539 EXC(    STREST  t0, -1(t1),             .Ls_exc)
 540         SHIFT_DISCARD_REVERT t0, t0, bits
 541         .set reorder
 542         ADDC(sum, t0)
 543         b       .Ldone
 544         .set noreorder
 545 .Ldst_unaligned:
 546         /*
 547          * dst is unaligned
 548          * t0 = src & ADDRMASK
 549          * t1 = dst & ADDRMASK; T1 > 0
 550          * len >= NBYTES
 551          *
 552          * Copy enough bytes to align dst
 553          * Set match = (src and dst have same alignment)
 554          */
 555 #define match rem
 556 EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
 557         ADD     t2, zero, NBYTES
 558 EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
 559         SUB     t2, t2, t1      # t2 = number of bytes copied
 560         xor     match, t0, t1
 561 EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)
 562         SLL     t4, t1, 3               # t4 = number of bits to discard
 563         SHIFT_DISCARD t3, t3, t4
 564         /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
 565         ADDC(sum, t3)
 566         beq     len, t2, .Ldone
 567          SUB    len, len, t2
 568         ADD     dst, dst, t2
 569         beqz    match, .Lboth_aligned
 570          ADD    src, src, t2
 571
 572 .Lsrc_unaligned_dst_aligned:
 573         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 574         beqz    t0, .Lcleanup_src_unaligned
 575          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
 576 1:
 577 /*
 578  * Avoid consecutive LD*'s to the same register since some mips
 579  * implementations can't issue them in the same cycle.
 580  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 581  * are to the same unit (unless src is aligned, but it's not).
 582  */
 583 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 584 EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
 585         SUB     len, len, 4*NBYTES
 586 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 587 EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
 588 EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
 589 EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
 590 EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
 591 EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
 592         ADD     src, src, 4*NBYTES
 593 #ifdef CONFIG_CPU_SB1
 594         nop                             # improves slotting
 595 #endif
 596 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc)
 597         ADDC(sum, t0)
 598 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc)
 599         ADDC(sum, t1)
 600 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc)
 601         ADDC(sum, t2)
 602 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc)
 603         ADDC(sum, t3)
 604         .set    reorder                         /* DADDI_WAR */
 605         ADD     dst, dst, 4*NBYTES
 606         bne     len, rem, 1b
 607         .set    noreorder
 608
 609 .Lcleanup_src_unaligned:
 610         beqz    len, .Ldone
 611          and    rem, len, NBYTES-1  # rem = len % NBYTES
 612         beq     rem, len, .Lcopy_bytes
 613          nop
 614 1:
 615 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 616 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 617         ADD     src, src, NBYTES
 618         SUB     len, len, NBYTES
 619 EXC(    STORE   t0, 0(dst),             .Ls_exc)
 620         ADDC(sum, t0)
 621         .set    reorder                         /* DADDI_WAR */
 622         ADD     dst, dst, NBYTES
 623         bne     len, rem, 1b
 624         .set    noreorder
 625
 626 .Lcopy_bytes_checklen:
 627         beqz    len, .Ldone
 628          nop
 629 .Lcopy_bytes:
 630         /* 0 < len < NBYTES  */
 631 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 632 #define SHIFT_START 0
 633 #define SHIFT_INC 8
 634 #else
 635 #define SHIFT_START 8*(NBYTES-1)
 636 #define SHIFT_INC -8
 637 #endif
 638         move    t2, zero        # partial word
 639         li      t3, SHIFT_START # shift
 640 /* use .Ll_exc_copy here to return correct sum on fault */
 641 #define COPY_BYTE(N)                    \
 642 EXC(    lbu     t0, N(src), .Ll_exc_copy);      \
 643         SUB     len, len, 1;            \
 644 EXC(    sb      t0, N(dst), .Ls_exc);   \
 645         SLLV    t0, t0, t3;             \
 646         addu    t3, SHIFT_INC;          \
 647         beqz    len, .Lcopy_bytes_done; \
 648          or     t2, t0
 649
 650         COPY_BYTE(0)
 651         COPY_BYTE(1)
 652 #ifdef USE_DOUBLE
 653         COPY_BYTE(2)
 654         COPY_BYTE(3)
 655         COPY_BYTE(4)
 656         COPY_BYTE(5)
 657 #endif
 658 EXC(    lbu     t0, NBYTES-2(src), .Ll_exc_copy)
 659         SUB     len, len, 1
 660 EXC(    sb      t0, NBYTES-2(dst), .Ls_exc)
 661         SLLV    t0, t0, t3
 662         or      t2, t0
 663 .Lcopy_bytes_done:
 664         ADDC(sum, t2)
 665 .Ldone:
 666         /* fold checksum */
 667         .set    push
 668         .set    noat
 669 #ifdef USE_DOUBLE
 670         dsll32  v1, sum, 0
 671         daddu   sum, v1
 672         sltu    v1, sum, v1
 673         dsra32  sum, sum, 0
 674         addu    sum, v1
 675 #endif
 676
 677         /* odd buffer alignment? */
 678         beqz    odd, 1f
 679          nop
 680         sll     v1, sum, 8
 681         srl     sum, sum, 8
 682         or      sum, v1
 683         andi    sum, 0xffff
 684         .set    pop
 685 1:
 686         .set reorder
 687         ADDC32(sum, psum)
 688         jr      ra
 689         .set noreorder
 690
 691 .Ll_exc_copy:
 692         /*
 693          * Copy bytes from src until faulting load address (or until a
 694          * lb faults)
 695          *
 696          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 697          * may be more than a byte beyond the last address.
 698          * Hence, the lb below may get an exception.
 699          *
 700          * Assumes src < THREAD_BUADDR($28)
 701          */
 702         LOAD    t0, TI_TASK($28)
 703          li     t2, SHIFT_START
 704         LOAD    t0, THREAD_BUADDR(t0)
 705 1:
 706 EXC(    lbu     t1, 0(src),     .Ll_exc)
 707         ADD     src, src, 1
 708         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 709         SLLV    t1, t1, t2
 710         addu    t2, SHIFT_INC
 711         ADDC(sum, t1)
 712         .set    reorder                         /* DADDI_WAR */
 713         ADD     dst, dst, 1
 714         bne     src, t0, 1b
 715         .set    noreorder
 716 .Ll_exc:
 717         LOAD    t0, TI_TASK($28)
 718          nop
 719         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 720          nop
 721         SUB     len, AT, t0             # len number of uncopied bytes
 722         /*
 723          * Here's where we rely on src and dst being incremented in tandem,
 724          *   See (3) above.
 725          * dst += (fault addr - src) to put dst at first byte to clear
 726          */
 727         ADD     dst, t0                 # compute start address in a1
 728         SUB     dst, src
 729         /*
 730          * Clear len bytes starting at dst.  Can't call __bzero because it
 731          * might modify len.  An inefficient loop for these rare times...
 732          */
 733         .set    reorder                         /* DADDI_WAR */
 734         SUB     src, len, 1
 735         beqz    len, .Ldone
 736         .set    noreorder
 737 1:      sb      zero, 0(dst)
 738         ADD     dst, dst, 1
 739         .set    push
 740         .set    noat
 741 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 742         bnez    src, 1b
 743          SUB    src, src, 1
 744 #else
 745         li      v1, 1
 746         bnez    src, 1b
 747          SUB    src, src, v1
 748 #endif
 749         li      v1, -EFAULT
 750         b       .Ldone
 751          sw     v1, (errptr)
 752
 753 .Ls_exc:
 754         li      v0, -1 /* invalid checksum */
 755         li      v1, -EFAULT
 756         jr      ra
 757          sw     v1, (errptr)
 758         .set    pop
 759         END(__csum_partial_copy_user)