powerpc: Use enhanced touch instructions in POWER7 copy_to_user/copy_from_user
arch/powerpc/lib/copyuser_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

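/*
 * STK_REG() maps r14..r22 to save slots within the STACKFRAMESIZE byte
 * frame the copy loops allocate below.  Each errN macro drops a local
 * label in front of the user-space access that follows it and records an
 * __ex_table entry pointing the fault handler at the matching .Ldo_errN
 * fixup.
 */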
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


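/*
 * Fixups for faults taken in the VMX loops: err4 faults happen after
 * r14-r16 have been saved, so restore those first; both paths then call
 * exit_vmx_usercopy to put the vector state back before dropping into the
 * common exit path below.
 */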
.Ldo_err4:
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Ldo_err3:
	bl	.exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

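/*
 * Fixups for the GPR loops: err2 faults occur with the stack frame and
 * non-volatiles live, err1 faults with the original stack.  All paths
 * reload the caller's r3/r4/r5 (stashed at entry) and hand the whole copy
 * back to __copy_tofrom_user_base, which redoes it and works out how many
 * bytes could not be copied.
 */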
.Ldo_err2:
	ld	r22,STK_REG(r22)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base


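/*
 * __copy_tofrom_user_power7(to, from, size): r3 = destination, r4 = source,
 * r5 = byte count; returns 0 when everything was copied (faults fall back
 * to __copy_tofrom_user_base, which returns the uncopied byte count).
 * This variant is presumably selected by a CPU feature fixup (e.g.
 * CPU_FTR_VMX_COPY) on POWER7 class machines.
 */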
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

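	/*
	 * mtocrf 0x01 copies the low nibble of r6 into cr7, so bit cr7*4+3
	 * means "1 byte needed", cr7*4+2 "2 bytes" and cr7*4+1 "4 bytes" to
	 * reach 8B source alignment.
	 */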
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

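	/*
	 * 128B or more left: allocate a frame and save the non-volatile GPRs
	 * (r14-r22) that the unrolled cacheline loop below uses as scratch.
	 */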
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	cmpwi	r3,0
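	/*
	 * enter_vmx_usercopy() returns non-zero when the vector unit may be
	 * used, and zero when we must fall back to the integer copy.  The
	 * cr0 result of the cmpwi above stays live across the prefetch
	 * setup below and is tested at the beq.
	 */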
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop
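	/*
	 * Rough sketch of the encoding above (per the Power ISA data stream
	 * variants of dcbt/dcbtst): the TH=0b01000 forms name a stream via a
	 * cacheline-aligned address with the stream ID in the low bits, and
	 * the TH=0b01010 forms program its depth/length and finally set GO
	 * to start all nominated streams.  The .machine "power4" push/pop is
	 * presumably there so the assembler accepts this three-operand form.
	 */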

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
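	/*
	 * exit_vmx_usercopy() restores the pre-copy FP/VMX state; it is
	 * assumed to return 0, which becomes this routine's "all bytes
	 * copied" return value via the tail call above.
	 */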

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

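	/*
	 * Unaligned source: lvx ignores the low four address bits, so each
	 * load returns an aligned quadword.  lvsl builds a permute control
	 * vector from the source misalignment, and each vperm below merges
	 * two adjacent aligned quadwords into the 16 bytes we actually want.
	 */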
	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

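	/*
	 * vr0 always carries the most recently loaded aligned quadword into
	 * the next vperm, which is why the 16B case above copies vr1 back
	 * into vr0 before falling through.
	 */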
5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */