/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */

#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
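
/*
 * STK_REG(rN) maps each nonvolatile register r14-r31 to its save slot
 * in the 256-byte scratch frame, above the 112-byte ABI frame header.
 */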

	.section __ex_table,"a"

	.section __ex_table,"a"

	.section __ex_table,"a"

	.section __ex_table,"a"
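
/*
 * Each .section directive above is a fragment of one of the err1-err4
 * fixup macros: every err-prefixed load or store gets an __ex_table
 * entry pairing its address with the handler that unwinds that stage.
 */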

	ld r16,STK_REG(r16)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r14,STK_REG(r14)(r1)

	ld r0,STACKFRAMESIZE+16(r1)

#endif /* CONFIG_ALTIVEC */

	ld r22,STK_REG(r22)(r1)
	ld r21,STK_REG(r21)(r1)
	ld r20,STK_REG(r20)(r1)
	ld r19,STK_REG(r19)(r1)
	ld r18,STK_REG(r18)(r1)
	ld r17,STK_REG(r17)(r1)
	ld r16,STK_REG(r16)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r14,STK_REG(r14)(r1)

	addi r1,r1,STACKFRAMESIZE

	b __copy_tofrom_user_base
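
	/*
	 * The fixup paths above restore any spilled nonvolatile registers,
	 * pop the scratch frame and rerun the copy with the base integer
	 * routine, which works out how many bytes were left uncopied.
	 */

/*
 * __copy_tofrom_user_power7: copy r5 bytes from r4 to r3; on return
 * r3 holds the number of bytes that could not be copied.
 */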
_GLOBAL(__copy_tofrom_user_power7)

	/* Get the source 8B aligned */
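	/*
	 * The alignment preamble (elided here) peels off single bytes,
	 * halfwords and words until the source sits on an 8B boundary.
	 */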

	stdu r1,-STACKFRAMESIZE(r1)
	std r14,STK_REG(r14)(r1)
	std r15,STK_REG(r15)(r1)
	std r16,STK_REG(r16)(r1)
	std r17,STK_REG(r17)(r1)
	std r18,STK_REG(r18)(r1)
	std r19,STK_REG(r19)(r1)
	std r20,STK_REG(r20)(r1)
	std r21,STK_REG(r21)(r1)
	std r22,STK_REG(r22)(r1)
	std r0,STACKFRAMESIZE+16(r1)
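	/*
	 * r0, presumably loaded from the link register by an elided mflr,
	 * is saved in the LR slot of the caller's frame so the fault
	 * fixups can restore it.
	 */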

	/* Now do cacheline (128B) sized loads and stores. */

err2;	std r19,104(r3)
err2;	std r20,112(r3)
err2;	std r21,120(r3)
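	/*
	 * Final stores of one 128B line; the rest of the loop body and
	 * the loop branch are elided. The err2 prefix gives each access
	 * an exception-table fixup.
	 */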

	ld r14,STK_REG(r14)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r16,STK_REG(r16)(r1)
	ld r17,STK_REG(r17)(r1)
	ld r18,STK_REG(r18)(r1)
	ld r19,STK_REG(r19)(r1)
	ld r20,STK_REG(r20)(r1)
	ld r21,STK_REG(r21)(r1)
	ld r22,STK_REG(r22)(r1)
	addi r1,r1,STACKFRAMESIZE
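	/* Nonvolatile registers restored and the scratch frame released. */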

	/* Up to 127B to go */

	/* Up to 63B to go */

	/* Up to 31B to go */

9:	clrldi r5,r5,(64-4)
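	/* Keep only the low 4 bits of r5: at most 15 bytes remain. */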

	/* Up to 15B to go */

err1;	lwz r0,0(r4)	/* Less chance of a reject with word ops */

.Lunwind_stack_nonvmx_copy:
	addi r1,r1,STACKFRAMESIZE

#ifdef CONFIG_ALTIVEC

	stdu r1,-STACKFRAMESIZE(r1)
	bl .enter_vmx_usercopy

	ld r0,STACKFRAMESIZE+16(r1)
	ld r3,STACKFRAMESIZE+48(r1)
	ld r4,STACKFRAMESIZE+56(r1)
	ld r5,STACKFRAMESIZE+64(r1)
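	/*
	 * The call clobbered the volatile registers, so the saved LR and
	 * the original arguments are reloaded from the save area that was
	 * filled in at entry (stores elided above).
	 */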

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */

	ori r9,r9,1		/* stream=1 */

	srdi r7,r5,7		/* length in cachelines, capped at 0x3FF */

1:	lis r0,0x0E00		/* depth=7 */

	ori r10,r7,1		/* stream=1 */

	lis r8,0x8000		/* GO=1 */

	dcbtst r0,r10,0b01010

	dcbt r0,r8,0b01010	/* GO */
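	/*
	 * The 0b01000/0b01010 TH values select the enhanced data-stream
	 * forms of dcbt/dcbtst: one encoding sets a stream's start
	 * address, the other its length and depth, and the final dcbt
	 * with GO=1 starts all configured streams.
	 */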

	beq .Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */

	rldicl. r6,r6,0,(64-4)
	bne .Lvmx_unaligned_copy
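	/*
	 * r6 presumably holds source XOR destination from an elided xor;
	 * a nonzero low nibble means the two buffers can never reach a
	 * common 16B alignment, so the slower vperm path must be taken.
	 */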

	/* Get the destination 16B aligned */

	/* Get the destination 128B aligned */

err3;	stvx vr1,r3,r10
err3;	stvx vr0,r3,r11

	std r14,STK_REG(r14)(r1)
	std r15,STK_REG(r15)(r1)
	std r16,STK_REG(r16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

err4;	stvx vr5,r3,r10
err4;	stvx vr4,r3,r11
err4;	stvx vr3,r3,r12
err4;	stvx vr2,r3,r14
err4;	stvx vr1,r3,r15
err4;	stvx vr0,r3,r16
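	/*
	 * Each iteration moves one 128B cacheline with eight lvx/stvx
	 * pairs; r9-r12 and r14-r16 presumably hold the constant offsets
	 * 16,32,...,112 set up in elided code.
	 */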

	ld r14,STK_REG(r14)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r16,STK_REG(r16)(r1)

	/* Up to 127B to go */

err3;	stvx vr1,r3,r10
err3;	stvx vr0,r3,r11

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)

15:	addi r1,r1,STACKFRAMESIZE
	b .exit_vmx_usercopy	/* tail call optimise */
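	/*
	 * With the frame popped, LR again holds the original return
	 * address, so exit_vmx_usercopy returns straight to our caller.
	 */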

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */

err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */

	/* Get the destination 128B aligned */

	lvsl vr16,0,r4	/* Setup permute control vector */
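	/*
	 * Unaligned technique: lvx ignores the low four address bits, so
	 * each vperm below splices the two aligned quadwords straddling
	 * the true source into one correctly shifted result.
	 */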

	vperm vr8,vr0,vr1,vr16

	vperm vr8,vr0,vr1,vr16
	vperm vr9,vr1,vr0,vr16

	vperm vr8,vr0,vr3,vr16
	vperm vr9,vr3,vr2,vr16
	vperm vr10,vr2,vr1,vr16
	vperm vr11,vr1,vr0,vr16

err3;	stvx vr10,r3,r10
err3;	stvx vr11,r3,r11

	std r14,STK_REG(r14)(r1)
	std r15,STK_REG(r15)(r1)
	std r16,STK_REG(r16)(r1)

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */

	vperm vr8,vr0,vr7,vr16
	vperm vr9,vr7,vr6,vr16
	vperm vr10,vr6,vr5,vr16
	vperm vr11,vr5,vr4,vr16
	vperm vr12,vr4,vr3,vr16
	vperm vr13,vr3,vr2,vr16
	vperm vr14,vr2,vr1,vr16
	vperm vr15,vr1,vr0,vr16

err4;	stvx vr10,r3,r10
err4;	stvx vr11,r3,r11
err4;	stvx vr12,r3,r12
err4;	stvx vr13,r3,r14
err4;	stvx vr14,r3,r15
err4;	stvx vr15,r3,r16
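	/*
	 * Main unaligned loop: each iteration reads eight aligned 16B
	 * quadwords (loads elided), splices neighbouring pairs with vperm
	 * and stores 128B of realigned data.
	 */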

	ld r14,STK_REG(r14)(r1)
	ld r15,STK_REG(r15)(r1)
	ld r16,STK_REG(r16)(r1)

	/* Up to 127B to go */

	vperm vr8,vr0,vr3,vr16
	vperm vr9,vr3,vr2,vr16
	vperm vr10,vr2,vr1,vr16
	vperm vr11,vr1,vr0,vr16

err3;	stvx vr10,r3,r10
err3;	stvx vr11,r3,r11

	vperm vr8,vr0,vr1,vr16
	vperm vr9,vr1,vr0,vr16

	vperm vr8,vr0,vr1,vr16

	/* Up to 15B to go */
11:	clrldi r5,r5,(64-4)
	addi r4,r4,-16	/* Unwind the +16 load offset */

err3;	lwz r0,0(r4)	/* Less chance of a reject with word ops */

15:	addi r1,r1,STACKFRAMESIZE
	b .exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */