/*
 * Copyright © 2006-2008, 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits
#endif
/******************************************************************************/
.macro asm_function function_name
    .global \function_name
#ifdef __ELF__
    .hidden \function_name
    .type \function_name, %function
#endif
\function_name:
.endm
/******************************************************************************/
/*
 * writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src)
 *
 * Copy a chunk of data from a cached scratch buffer (so prefetch is not
 * really needed) to a memory buffer, in the forward direction. Generated
 * from pixman macro templates.
 */
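/*
 * A minimal C usage sketch, assuming the AAPCS calling convention
 * (numbytes in r0, dst in r1, src in r2); the 'scratch' buffer and
 * 'fb_ptr' names are hypothetical, only the signature above is given:
 *
 *     void writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src);
 *
 *     static uint8_t scratch[4096];
 *     // ... render into 'scratch' (cached, so it reads back cheaply) ...
 *     writeback_scratch_to_mem_neon(4096, fb_ptr, scratch);
 */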
asm_function writeback_scratch_to_mem_neon
5:  vld1.8  {d2-d3}, [r2]!
    vst1.8  {d0[2]}, [r1]!
    vst1.8  {d0[3]}, [r1]!
    vst1.8  {d0[4]}, [r1]!
    vst1.8  {d0[5]}, [r1]!
    vst1.8  {d0[6]}, [r1]!
    vst1.8  {d0[7]}, [r1]!
    vst1.8  {d1}, [r1, :64]!
9:  vst1.8  {d2-d3}, [r1, :128]!
    vld1.8  {d0-d3}, [r2]!
12: vst1.8  {d0-d3}, [r1, :128]!
    vld1.8  {d0-d3}, [r2]!
11: vst1.8  {d0-d3}, [r1, :128]!
    vld1.8  {d2-d3}, [r2]!
    vld1.8  {d0[4]}, [r2]!
    vld1.8  {d0[5]}, [r2]!
    vld1.8  {d0[6]}, [r2]!
    vld1.8  {d0[7]}, [r2]!
    vld1.8  {d0[2]}, [r2]!
    vld1.8  {d0[3]}, [r2]!
    vld1.8  {d0[1]}, [r2]!
    vst1.8  {d2-d3}, [r1, :128]!
    vst1.8  {d1}, [r1, :64]!
    vst1.8  {d0[4]}, [r1]!
    vst1.8  {d0[5]}, [r1]!
    vst1.8  {d0[6]}, [r1]!
    vst1.8  {d0[7]}, [r1]!
    vst1.8  {d0[2]}, [r1]!
    vst1.8  {d0[3]}, [r1]!
    vst1.8  {d0[1]}, [r1]!
    vld1.8  {d2-d3}, [r2]!
    vld1.8  {d0[4]}, [r2]!
    vld1.8  {d0[5]}, [r2]!
    vld1.8  {d0[6]}, [r2]!
    vld1.8  {d0[7]}, [r2]!
    vld1.8  {d0[2]}, [r2]!
    vld1.8  {d0[3]}, [r2]!
    vld1.8  {d0[1]}, [r2]!
    vst1.8  {d2-d3}, [r1]!
    vst1.8  {d0[4]}, [r1]!
    vst1.8  {d0[5]}, [r1]!
    vst1.8  {d0[6]}, [r1]!
    vst1.8  {d0[7]}, [r1]!
    vst1.8  {d0[2]}, [r1]!
    vst1.8  {d0[3]}, [r1]!
    vst1.8  {d0[1]}, [r1]!
/******************************************************************************/
/*
 * Helper macro for the memcpy function. It copies data from the source (r1)
 * to the destination (r0) buffer, fixing up alignment in the process. The
 * destination buffer must already be aligned (4 byte alignment is required).
 * The size of the block to copy is passed in the r2 register.
 */
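/*
 * The conditional mov/orr pairs below implement unaligned word reads by
 * combining two neighbouring aligned words. A hedged C sketch of the idea
 * (little-endian; 'shift' is the source misalignment in bytes, 1..3):
 *
 *     uint32_t lo = src32[0], hi = src32[1];   // two aligned loads
 *     uint32_t out = (lo >> (shift * 8))       // mov rX, rX, lsr #(shift * 8)
 *                  | (hi << (32 - shift * 8)); // orr rX, rX, rY, asl #(32 - shift * 8)
 */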
.macro UNALIGNED_MEMCPY shift
    movne   r3, ip, lsr #(\shift * 8)
    orrne   r3, r3, ip, asl #(32 - \shift * 8)
    movne   r3, ip, lsr #(\shift * 8)
    ldmiane r1!, {r4, ip}
    orrne   r3, r3, r4, asl #(32 - \shift * 8)
    movne   r4, r4, lsr #(\shift * 8)
    orrne   r4, r4, ip, asl #(32 - \shift * 8)
    stmfd   sp!, {r7, r8, r9, r10, r11}
    movge   r3, ip, lsr #(\shift * 8)
    ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip}
    orrge   r3, r3, r4, asl #(32 - \shift * 8)
    movge   r4, r4, lsr #(\shift * 8)
    orrge   r4, r4, r5, asl #(32 - \shift * 8)
    movge   r5, r5, lsr #(\shift * 8)
    orrge   r5, r5, r6, asl #(32 - \shift * 8)
    movge   r6, r6, lsr #(\shift * 8)
    orrge   r6, r6, r7, asl #(32 - \shift * 8)
    movge   r7, r7, lsr #(\shift * 8)
    orrge   r7, r7, r8, asl #(32 - \shift * 8)
    movge   r8, r8, lsr #(\shift * 8)
    orrge   r8, r8, r10, asl #(32 - \shift * 8)
    movge   r10, r10, lsr #(\shift * 8)
    orrge   r10, r10, r11, asl #(32 - \shift * 8)
    movge   r11, r11, lsr #(\shift * 8)
    orrge   r11, r11, ip, asl #(32 - \shift * 8)
    stmiage r0!, {r7, r8, r10, r11}
    ldmfd   sp!, {r7, r8, r9, r10, r11}
3:  /* copy remaining data */
    movne   r3, ip, lsr #(\shift * 8)
    ldmiane r1!, {r4-r6, ip}
    orrne   r3, r3, r4, asl #(32 - \shift * 8)
    movne   r4, r4, lsr #(\shift * 8)
    orrne   r4, r4, r5, asl #(32 - \shift * 8)
    movne   r5, r5, lsr #(\shift * 8)
    orrne   r5, r5, r6, asl #(32 - \shift * 8)
    movne   r6, r6, lsr #(\shift * 8)
    orrne   r6, r6, ip, asl #(32 - \shift * 8)
    movne   r3, ip, lsr #(\shift * 8)
    ldmiane r1!, {r4, ip}
    orrne   r3, r3, r4, asl #(32 - \shift * 8)
    movne   r4, r4, lsr #(\shift * 8)
    orrne   r4, r4, ip, asl #(32 - \shift * 8)
    movne   r3, ip, lsr #(\shift * 8)
    sub     r1, r1, #(4 - \shift)
    orrne   r3, r3, ip, asl #(32 - \shift * 8)
.endm
/*
 * Memcpy function with Raspberry Pi specific aligned prefetch, based on
 * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S
 */
asm_function memcpy_armv5te
    /* copy data until the destination address is 4 bytes aligned */
    orrne   r3, r3, r4, asl #8
    /* the destination address is now 4 bytes aligned */
    /* next we need to handle 4 cases of source address alignment */
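    /*
     * A hedged C sketch of that dispatch (illustrative only; the helper
     * names are hypothetical, and the assembly branches on 'src & 3'
     * rather than using a switch):
     *
     *     switch ((uintptr_t)src & 3) {
     *     case 0: copy_words_aligned(dst, src, n); break; // ldmia/stmia loop
     *     case 1: copy_words_shift1(dst, src, n);  break; // UNALIGNED_MEMCPY 1
     *     case 2: copy_words_shift2(dst, src, n);  break; // UNALIGNED_MEMCPY 2
     *     case 3: copy_words_shift3(dst, src, n);  break; // UNALIGNED_MEMCPY 3
     *     }
     */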
    /* both source and destination are 4 bytes aligned */
    stmfd   sp!, {r7, r8, r9, r10, r11}
    ldmiage r1!, {r3-r6, r7, r8, r10, r11}
    stmiage r0!, {r7, r8, r10, r11}
    ldmfd   sp!, {r7, r8, r9, r10, r11}
/******************************************************************************/
/*
 * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
 *
 * Both the 'scratch' and 'fbmem' pointers must be 32 bytes aligned.
 * The value in 'numbytes' is also rounded up to a multiple of 32 bytes.
 *
 * The only purpose of this code is to attempt to minimize the penalty incurred
 * by doing uncached reads from memory (for example a framebuffer). We are
 * trying to do the largest possible perfectly aligned reads to fetch
 * data into a temporary scratch buffer in the L1 cache.
 */
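/*
 * A minimal C usage sketch (hedged; the 'scratch' and 'fbmem' names are
 * hypothetical, only the signature and alignment rules above are given):
 *
 *     void aligned_fetch_fbmem_to_scratch_neon(int numbytes,
 *                                              void *scratch, void *fbmem);
 *
 *     static uint8_t scratch[8192] __attribute__((aligned(32)));
 *     aligned_fetch_fbmem_to_scratch_neon(8192, scratch, fbmem);
 */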
asm_function aligned_fetch_fbmem_to_scratch_neon
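    /* SIZE, DST and SRC below are presumably .req register aliases for the
       numbytes (r0), scratch (r1) and fbmem (r2) arguments respectively */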
    /* aligned load from the source (framebuffer) */
    vld1.64 {q0, q1}, [SRC, :256]!
    vld1.64 {q2, q3}, [SRC, :256]!
    vld1.64 {q8, q9}, [SRC, :256]!
    vld1.64 {q10, q11}, [SRC, :256]!
    /* fetch the destination (scratch buffer) into the L1 cache */
    /* aligned store to the scratch buffer */
    vst1.64 {q0, q1}, [DST, :256]!
    vst1.64 {q2, q3}, [DST, :256]!
    vst1.64 {q8, q9}, [DST, :256]!
    vst1.64 {q10, q11}, [DST, :256]!
    subs    SIZE, SIZE, #128
    vld1.64 {q0, q1}, [SRC, :256]!
    vld1.64 {q2, q3}, [SRC, :256]!
    vst1.64 {q0, q1}, [DST, :256]!
    vst1.64 {q2, q3}, [DST, :256]!
    vld1.64 {q0, q1}, [SRC, :256]!
    vst1.64 {q0, q1}, [DST, :256]!
    vld1.64 {q0, q1}, [SRC, :256]!
    vst1.64 {q0, q1}, [DST, :256]!
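/*
 * aligned_fetch_fbmem_to_scratch_vfp: presumably the same operation as the
 * NEON variant above, implemented with VFP block transfers (vldm/vstm move
 * d0-d15, i.e. 128 bytes per main loop iteration).
 */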
asm_function aligned_fetch_fbmem_to_scratch_vfp
    /* aligned load from the source (framebuffer) */
    vldm    SRC!, {d0-d15}
    /* aligned store to the scratch buffer */
    vstm    DST!, {d0-d15}
    subs    SIZE, SIZE, #128
    vldm    SRC!, {d0-d7}
    vstm    DST!, {d0-d7}
    vldm    SRC!, {d0-d3}
    vstm    DST!, {d0-d3}
    vldm    SRC!, {d0-d3}
    vstm    DST!, {d0-d3}
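/*
 * aligned_fetch_fbmem_to_scratch_arm: presumably the same operation again,
 * implemented with plain ARM load/store instructions for CPUs that have
 * neither NEON nor VFP.
 */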
asm_function aligned_fetch_fbmem_to_scratch_arm
    subs    SIZE, SIZE, #128