Merge v2.6.37-rc8 into powerpc/next
[pandora-kernel.git] / arch / powerpc / lib / memcpy_64.S
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11
/*
 * memcpy entry point and the doubleword copy path.
 *
 * In:   r3 = destination, r4 = source, r5 = byte count.
 * Out:  r3 = the destination pointer as it was on entry; it is stored at
 *       48(r1) first thing and reloaded before every blr.
 *       NOTE(review): 48(r1) is presumably the caller-provided save slot
 *       for r3 under the 64-bit ELF ABI -- confirm.
 * Registers written anywhere in this routine: r0, r3-r12, CTR, cr0, cr1,
 * cr6, cr7.  r1 is never modified, so no stack frame is created.
 *
 * Condition-register conventions used throughout:
 *   cr7 - low 4 bits of the byte count, loaded with PPC_MTOCRF (a macro
 *         defined outside this file).  The bf/bt tests treat cr7*4+0,
 *         +1, +2, +3 as the "8", "4", "2" and "1" bits of the count.
 *   cr1 - unsigned compare of the byte count against 16.
 *   cr0 - result of the most recent record-form andi.
 */
12         .align  7
13 _GLOBAL(memcpy)
14         std     r3,48(r1)       /* save destination pointer for return value */
15         PPC_MTOCRF      0x01,r5         /* cr7 = low 4 bits of the byte count */
16         cmpldi  cr1,r5,16               /* cr1: byte count vs 16 (unsigned) */
17         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
18         andi.   r6,r6,7                 /* r6 = 0..7; cr0.eq set when dest is already 8-byte aligned */
19         dcbt    0,r4                    /* cache-touch hint for the first source block */
20         blt     cr1,.Lshort_copy        /* fewer than 16 bytes: simple path */
21 /* Below we want to nop out the bne if we're on a CPU that has the
22    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
23    cleared.
24    At the time of writing the only CPU that has this combination of bits
25    set is Power6. */
26 BEGIN_FTR_SECTION
27         nop
28 FTR_SECTION_ELSE
29         bne     .Ldst_unaligned         /* tests cr0 from the andi. on r6 above */
30 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
31                     CPU_FTR_UNALIGNED_LD_STD)
/*
 * Doubleword copy loop.  Reached by fall-through from the entry code or by
 * the branch at the end of .Ldst_unaligned.  Both pointers are biased
 * (r3 by -16, r4 by -8) so the loop can use 8(rX) offsets together with
 * update-form ldu/stdu ...,16(rX).  Each loop iteration moves 16 bytes.
 */
32 .Ldst_aligned:
33         addi    r3,r3,-16               /* bias dest for the 8(r3) / stdu 16(r3) stores */
/* NOTE(review): judging by the macro name, the next two instructions are
   presumably only live on CPUs without CPU_FTR_UNALIGNED_LD_STD -- the
   macro is defined outside this file, confirm there. */
34 BEGIN_FTR_SECTION
35         andi.   r0,r4,7                 /* r0 = source offset within its doubleword */
36         bne     .Lsrc_unaligned         /* non-zero offset: shift-and-merge copy */
37 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
38         srdi    r7,r5,4                 /* r7 = byte count / 16 = loop iterations */
39         ld      r9,0(r4)                /* preload the first source doubleword */
40         addi    r4,r4,-8                /* bias source for ld 8(r4) / ldu 16(r4) */
41         mtctr   r7
42         andi.   r5,r5,7                 /* r5 = trailing bytes (0..7); cr0.eq set when none */
43         bf      cr7*4+0,2f              /* "8" bit clear: even doubleword count, enter loop at 2: */
44         addi    r3,r3,8                 /* odd doubleword count: advance both pointers by 8 ... */
45         addi    r4,r4,8
46         mr      r8,r9                   /* ... and treat the preloaded doubleword as the pending one */
47         blt     cr1,3f                  /* count < 16 (only possible after .Ldst_unaligned): one doubleword only */
48 1:      ld      r9,8(r4)
49         std     r8,8(r3)
50 2:      ldu     r8,16(r4)
51         stdu    r9,16(r3)
52         bdnz    1b
53 3:      std     r8,8(r3)                /* store the last doubleword loaded */
54         beq     3f                      /* cr0 from "andi. r5,r5,7": no trailing bytes, return */
55         addi    r3,r3,16                /* r3 -> first dest byte not yet written */
/*
 * Copy the trailing 0..7 bytes, selected by the "4", "2" and "1" bits of
 * the count held in cr7.  The source is read at 8(r4) because r4 still
 * points at the last doubleword loaded, 8 bytes before the next unread byte.
 * No branch to .Ldo_tail appears in this file's visible code; it is entered
 * by fall-through.
 */
56 .Ldo_tail:
57         bf      cr7*4+1,1f              /* "4" bit clear: no word to copy */
58         lwz     r9,8(r4)
59         addi    r4,r4,4
60         stw     r9,0(r3)
61         addi    r3,r3,4
62 1:      bf      cr7*4+2,2f              /* "2" bit clear: no halfword to copy */
63         lhz     r9,8(r4)
64         addi    r4,r4,2
65         sth     r9,0(r3)
66         addi    r3,r3,2
67 2:      bf      cr7*4+3,3f              /* "1" bit clear: no final byte */
68         lbz     r9,8(r4)
69         stb     r9,0(r3)
70 3:      ld      r3,48(r1)       /* return dest pointer */
71         blr
72
/*
 * Copy path for a source that is not 8-byte aligned.
 *
 * Entered from .Ldst_aligned with:
 *   r0  = r4 & 7 (non-zero source offset within its doubleword)
 *   r3  = destination biased by -16
 *   r4  = source, r5 = byte count, cr7 = low 4 bits of the byte count
 *
 * The source is read only as aligned doublewords, starting from r4 rounded
 * down to an 8-byte boundary.  Each destination doubleword is assembled as
 *   (s[n] << r10) | (s[n+1] >> r11),   r10 = 8 * r0,  r11 = 64 - r10
 * i.e. the lower-addressed doubleword supplies the high-order bytes.
 *
 * Numeric labels here are reused by other sections of memcpy; each
 * Nf / Nb reference binds to the nearest definition in that direction.
 */
73 .Lsrc_unaligned:
74         srdi    r6,r5,3                 /* r6 = number of whole doublewords in the count */
75         addi    r5,r5,-16
76         subf    r4,r0,r4                /* round the source down to an 8-byte boundary */
77         srdi    r7,r5,4                 /* r7 = (count - 16) / 16, the main-loop iteration count */
78         sldi    r10,r0,3                /* r10 = source offset in bits (left-shift amount) */
79         cmpdi   cr6,r6,3                /* cr6: doubleword count vs 3, selects the short exits below */
80         andi.   r5,r5,7                 /* r5 = trailing bytes (0..7); cr0.eq set when none */
81         mtctr   r7
82         subfic  r11,r10,64              /* r11 = 64 - r10 (right-shift amount) */
83         add     r5,r5,r0                /* trailing bytes + source offset; compared with 8 in the tail code */
84
85         bt      cr7*4+0,0f              /* "8" bit set: odd doubleword count, use the 0: prologue */
86
87         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
88         ld      r0,8(r4)
89         sld     r6,r9,r10
90         ldu     r9,16(r4)
91         srd     r7,r0,r11
92         sld     r8,r0,r10
93         or      r7,r7,r6
94         blt     cr6,4f                  /* fewer than 3 doublewords: skip the loop, store from 4: */
95         ld      r0,8(r4)
96         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
97         b       2f
98
99 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
100         ldu     r9,8(r4)
101         sld     r8,r0,r10
102         addi    r3,r3,-8                /* extra -8 bias so the single store at 5: (24(r3)) lands at the start of dest */
103         blt     cr6,5f                  /* fewer than 3 doublewords: only the store at 5: is needed */
104         ld      r0,8(r4)
105         srd     r12,r9,r11
106         sld     r6,r9,r10
107         ldu     r9,16(r4)
108         or      r12,r8,r12
109         srd     r7,r0,r11
110         sld     r8,r0,r10
111         addi    r3,r3,16
112         beq     cr6,3f                  /* exactly 3 doublewords: skip the loop, store from 3: */
113
/* Main loop: two destination doublewords are stored per iteration (std at
   8(r3), then stdu advancing r3 by 16) while the next two source
   doublewords are loaded and shifted. */
114         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
115 1:      or      r7,r7,r6
116         ld      r0,8(r4)
117         std     r12,8(r3)
118 2:      srd     r12,r9,r11
119         sld     r6,r9,r10
120         ldu     r9,16(r4)
121         or      r12,r8,r12
122         stdu    r7,16(r3)
123         srd     r7,r0,r11
124         sld     r8,r0,r10
125         bdnz    1b
126
/* Loop epilogue: flush the doublewords still held in registers.  The three
   entry points 3:, 4: and 5: store three, two or one doubleword(s). */
127 3:      std     r12,8(r3)
128         or      r7,r7,r6
129 4:      std     r7,16(r3)
130 5:      srd     r12,r9,r11
131         or      r12,r8,r12
132         std     r12,24(r3)
133         beq     4f                      /* cr0 from "andi. r5,r5,7": no trailing bytes; this 4f is the "4:" return sequence at the end of .Lshort_copy */
134         cmpwi   cr1,r5,8                /* r5 = trailing bytes + source offset */
135         addi    r3,r3,32                /* r3 -> first dest byte not yet written */
136         sld     r9,r9,r10               /* left-justify the unconsumed bytes of the last loaded doubleword */
137         ble     cr1,6f                  /* <= 8: those bytes already cover the tail */
138         ld      r0,8(r4)                /* otherwise one more source doubleword is needed */
139         srd     r7,r0,r11
140         or      r9,r7,r9
/* Store the trailing bytes from r9, high-order end first.  Each rotldi
   rotates the next 4, 2 or 1 byte(s) into the low-order end of r9, which is
   what stw / sth / stb write. */
141 6:
142         bf      cr7*4+1,1f              /* "4" bit clear: no word to store */
143         rotldi  r9,r9,32
144         stw     r9,0(r3)
145         addi    r3,r3,4
146 1:      bf      cr7*4+2,2f              /* "2" bit clear: no halfword to store */
147         rotldi  r9,r9,16
148         sth     r9,0(r3)
149         addi    r3,r3,2
150 2:      bf      cr7*4+3,3f              /* "1" bit clear: no final byte */
151         rotldi  r9,r9,8
152         stb     r9,0(r3)
153 3:      ld      r3,48(r1)       /* return dest pointer */
154         blr
155
/*
 * Destination is not 8-byte aligned.  Copy the 1..7 leading bytes needed to
 * reach an 8-byte destination boundary, then rejoin .Ldst_aligned.
 *
 * On entry: r3 = dest, r4 = source, r5 = byte count (at least 16, since
 * shorter copies branched to .Lshort_copy earlier), r6 = number of bytes to
 * the boundary (non-zero).
 * On exit:  r3 and r4 advanced by r6, r5 reduced by r6, and cr7 / cr1
 * recomputed from the reduced count for use by .Ldst_aligned.
 * r7 is the running byte offset of the pieces copied here.
 */
156 .Ldst_unaligned:
157         PPC_MTOCRF      0x01,r6         # put #bytes to 8B bdry into cr7
158         subf    r5,r6,r5                /* r5 = bytes left after the alignment copy */
159         li      r7,0                    /* r7 = offset copied so far */
160         cmpldi  cr1,r5,16               /* cr1: remaining count vs 16, tested again in .Ldst_aligned */
161         bf      cr7*4+3,1f              /* "1" bit of r6 clear: no single byte */
162         lbz     r0,0(r4)
163         stb     r0,0(r3)
164         addi    r7,r7,1
165 1:      bf      cr7*4+2,2f              /* "2" bit of r6 clear: no halfword */
166         lhzx    r0,r7,r4                /* halfword at source + r7 */
167         sthx    r0,r7,r3
168         addi    r7,r7,2
169 2:      bf      cr7*4+1,3f              /* "4" bit of r6 clear: no word */
170         lwzx    r0,r7,r4                /* word at source + r7 */
171         stwx    r0,r7,r3
172 3:      PPC_MTOCRF      0x01,r5         /* cr7 = low 4 bits of the remaining count */
173         add     r4,r6,r4                /* step source past the bytes just copied */
174         add     r3,r6,r3                /* dest is now on an 8-byte boundary */
175         b       .Ldst_aligned
176
/*
 * Copies of fewer than 16 bytes (taken when the entry compare finds the
 * byte count below 16).  cr7 holds the low 4 bits of the count; the "8",
 * "4", "2" and "1" bits each select one fixed-size piece, copied in that
 * order with r3 and r4 advanced after each piece.  The 8-byte piece is moved
 * as two 4-byte words.
 */
177 .Lshort_copy:
178         bf      cr7*4+0,1f              /* "8" bit clear: skip the 8-byte piece */
179         lwz     r0,0(r4)
180         lwz     r9,4(r4)
181         addi    r4,r4,8
182         stw     r0,0(r3)
183         stw     r9,4(r3)
184         addi    r3,r3,8
185 1:      bf      cr7*4+1,2f              /* "4" bit clear: skip the word */
186         lwz     r0,0(r4)
187         addi    r4,r4,4
188         stw     r0,0(r3)
189         addi    r3,r3,4
190 2:      bf      cr7*4+2,3f              /* "2" bit clear: skip the halfword */
191         lhz     r0,0(r4)
192         addi    r4,r4,2
193         sth     r0,0(r3)
194         addi    r3,r3,2
195 3:      bf      cr7*4+3,4f              /* "1" bit clear: skip the final byte */
196         lbz     r0,0(r4)
197         stb     r0,0(r3)
/* This "4:" return sequence is also the target of the "beq 4f" in the tail
   of .Lsrc_unaligned, so it must remain the first "4:" label after that
   branch. */
198 4:      ld      r3,48(r1)       /* return dest pointer */
199         blr