--- /dev/null
+/*
+ * Copyright © 2006-2008, 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#ifdef __arm__
+
+.text
+.syntax unified
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.altmacro
+.p2align 2
+
+/******************************************************************************/
+
+.macro asm_function function_name
+ .global \function_name
+#ifdef __ELF__
+ .hidden \function_name
+ .type \function_name, %function
+#endif
+.func \function_name
+\function_name:
+.endm
+
+/******************************************************************************/
+
+/*
+ * writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src)
+ *
+ * Copies a chunk of data from a cached scratch buffer (so no prefetch is
+ * really needed) to a memory buffer, in the forward direction. Generated
+ * from pixman macro templates.
+ */
+
+asm_function writeback_scratch_to_mem_neon
+ mov ip, r1
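+ /* r0 = numbytes, r1 = dst, r2 = src; ip mirrors dst so destination
+    alignment can be tracked on the load side, while r1 itself only
+    advances once the stores are issued */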
+ cmp r0, #32
+ blt 0f
+ tst ip, #15
+ beq 1f
+ tst ip, #1
+ beq 2f
+ vld1.8 {d0[1]}, [r2]!
+ add ip, ip, #1
+ sub r0, r0, #1
+2: tst ip, #2
+ beq 3f
+ vld1.8 {d0[2]}, [r2]!
+ vld1.8 {d0[3]}, [r2]!
+ add ip, ip, #2
+ sub r0, r0, #2
+3: tst ip, #4
+ beq 4f
+ vld1.8 {d0[4]}, [r2]!
+ vld1.8 {d0[5]}, [r2]!
+ vld1.8 {d0[6]}, [r2]!
+ vld1.8 {d0[7]}, [r2]!
+ add ip, ip, #4
+ sub r0, r0, #4
+4: tst ip, #8
+ beq 5f
+ vld1.8 {d1}, [r2]!
+ add ip, ip, #8
+ sub r0, r0, #8
+5: vld1.8 {d2-d3}, [r2]!
+ add ip, ip, #16
+ sub r0, r0, #16
+ tst r1, #1
+ beq 6f
+ vst1.8 {d0[1]}, [r1]!
+6: tst r1, #2
+ beq 7f
+ vst1.8 {d0[2]}, [r1]!
+ vst1.8 {d0[3]}, [r1]!
+7: tst r1, #4
+ beq 8f
+ vst1.8 {d0[4]}, [r1]!
+ vst1.8 {d0[5]}, [r1]!
+ vst1.8 {d0[6]}, [r1]!
+ vst1.8 {d0[7]}, [r1]!
+8: tst r1, #8
+ beq 9f
+ vst1.8 {d1}, [r1, :64]!
+9: vst1.8 {d2-d3}, [r1, :128]!
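+ /* dst is now 16-byte aligned: copy the bulk in 32-byte chunks using
+    128-bit aligned stores */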
+1: subs r0, r0, #32
+ blt 10f
+ vld1.8 {d0-d3}, [r2]!
+ subs r0, r0, #32
+ blt 11f
+12: vst1.8 {d0-d3}, [r1, :128]!
+ vld1.8 {d0-d3}, [r2]!
+ subs r0, r0, #32
+ bge 12b
+11: vst1.8 {d0-d3}, [r1, :128]!
+10: tst r0, #31
+ beq 13f
+ tst r0, #16
+ beq 14f
+ vld1.8 {d2-d3}, [r2]!
+14: tst r0, #8
+ beq 15f
+ vld1.8 {d1}, [r2]!
+15: tst r0, #4
+ beq 16f
+ vld1.8 {d0[4]}, [r2]!
+ vld1.8 {d0[5]}, [r2]!
+ vld1.8 {d0[6]}, [r2]!
+ vld1.8 {d0[7]}, [r2]!
+16: tst r0, #2
+ beq 17f
+ vld1.8 {d0[2]}, [r2]!
+ vld1.8 {d0[3]}, [r2]!
+17: tst r0, #1
+ beq 18f
+ vld1.8 {d0[1]}, [r2]!
+18: tst r0, #16
+ beq 19f
+ vst1.8 {d2-d3}, [r1, :128]!
+19: tst r0, #8
+ beq 20f
+ vst1.8 {d1}, [r1, :64]!
+20: tst r0, #4
+ beq 21f
+ vst1.8 {d0[4]}, [r1]!
+ vst1.8 {d0[5]}, [r1]!
+ vst1.8 {d0[6]}, [r1]!
+ vst1.8 {d0[7]}, [r1]!
+21: tst r0, #2
+ beq 22f
+ vst1.8 {d0[2]}, [r1]!
+ vst1.8 {d0[3]}, [r1]!
+22: tst r0, #1
+ beq 13f
+ vst1.8 {d0[1]}, [r1]!
+13: bx lr
+0: tst r0, #31
+ beq 23f
+ tst r0, #16
+ beq 24f
+ vld1.8 {d2-d3}, [r2]!
+24: tst r0, #8
+ beq 25f
+ vld1.8 {d1}, [r2]!
+25: tst r0, #4
+ beq 26f
+ vld1.8 {d0[4]}, [r2]!
+ vld1.8 {d0[5]}, [r2]!
+ vld1.8 {d0[6]}, [r2]!
+ vld1.8 {d0[7]}, [r2]!
+26: tst r0, #2
+ beq 27f
+ vld1.8 {d0[2]}, [r2]!
+ vld1.8 {d0[3]}, [r2]!
+27: tst r0, #1
+ beq 28f
+ vld1.8 {d0[1]}, [r2]!
+28: tst r0, #16
+ beq 29f
+ vst1.8 {d2-d3}, [r1]!
+29: tst r0, #8
+ beq 30f
+ vst1.8 {d1}, [r1]!
+30: tst r0, #4
+ beq 31f
+ vst1.8 {d0[4]}, [r1]!
+ vst1.8 {d0[5]}, [r1]!
+ vst1.8 {d0[6]}, [r1]!
+ vst1.8 {d0[7]}, [r1]!
+31: tst r0, #2
+ beq 32f
+ vst1.8 {d0[2]}, [r1]!
+ vst1.8 {d0[3]}, [r1]!
+32: tst r0, #1
+ beq 23f
+ vst1.8 {d0[1]}, [r1]!
+23: bx lr
+.endfunc
+
+/******************************************************************************/
+
+/*
+ * Helper macro for the memcpy function: copies data from the source (r1)
+ * to the destination (r0) buffer, fixing up source alignment in the
+ * process. The destination buffer must already be 4-byte aligned.
+ * The size of the block to copy is passed in r2.
+ */
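+/*
+ * Example: for a source address with (src & 3) == 2 the caller expands
+ * 'UNALIGNED_MEMCPY 2'; word-aligned loads are then done from the source
+ * and recombined with 16-bit shifts before being stored to the aligned
+ * destination.
+ */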
+.macro UNALIGNED_MEMCPY shift
+ sub r1, #(\shift)
+ ldr ip, [r1], #4
+
+ tst r0, #4
+ movne r3, ip, lsr #(\shift * 8)
+ ldrne ip, [r1], #4
+ subne r2, r2, #4
+ orrne r3, r3, ip, asl #(32 - \shift * 8)
+ strne r3, [r0], #4
+
+ tst r0, #8
+ movne r3, ip, lsr #(\shift * 8)
+ ldmiane r1!, {r4, ip}
+ subne r2, r2, #8
+ orrne r3, r3, r4, asl #(32 - \shift * 8)
+ movne r4, r4, lsr #(\shift * 8)
+ orrne r4, r4, ip, asl #(32 - \shift * 8)
+ stmiane r0!, {r3-r4}
+ cmp r2, #32
+ blt 3f
+ pld [r1, #48]
+ stmfd sp!, {r7, r8, r9, r10, r11}
+ add r3, r1, #128
+ bic r3, r3, #31
+ sub r9, r3, r1
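+ /* r9 = pld offset: distance from r1 to the 32-byte aligned address at or
+    below r1 + 128, so the prefetches in the loop stay 32-byte aligned */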
+1:
+ pld [r1, r9]
+ subs r2, r2, #32
+ movge r3, ip, lsr #(\shift * 8)
+ ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip}
+ orrge r3, r3, r4, asl #(32 - \shift * 8)
+ movge r4, r4, lsr #(\shift * 8)
+ orrge r4, r4, r5, asl #(32 - \shift * 8)
+ movge r5, r5, lsr #(\shift * 8)
+ orrge r5, r5, r6, asl #(32 - \shift * 8)
+ movge r6, r6, lsr #(\shift * 8)
+ orrge r6, r6, r7, asl #(32 - \shift * 8)
+ stmiage r0!, {r3-r6}
+ movge r7, r7, lsr #(\shift * 8)
+ orrge r7, r7, r8, asl #(32 - \shift * 8)
+ movge r8, r8, lsr #(\shift * 8)
+ orrge r8, r8, r10, asl #(32 - \shift * 8)
+ movge r10, r10, lsr #(\shift * 8)
+ orrge r10, r10, r11, asl #(32 - \shift * 8)
+ movge r11, r11, lsr #(\shift * 8)
+ orrge r11, r11, ip, asl #(32 - \shift * 8)
+ stmiage r0!, {r7, r8, r10, r11}
+ bgt 1b
+2:
+ ldmfd sp!, {r7, r8, r9, r10, r11}
+3: /* copy remaining data */
+ tst r2, #16
+ movne r3, ip, lsr #(\shift * 8)
+ ldmiane r1!, {r4-r6, ip}
+ orrne r3, r3, r4, asl #(32 - \shift * 8)
+ movne r4, r4, lsr #(\shift * 8)
+ orrne r4, r4, r5, asl #(32 - \shift * 8)
+ movne r5, r5, lsr #(\shift * 8)
+ orrne r5, r5, r6, asl #(32 - \shift * 8)
+ movne r6, r6, lsr #(\shift * 8)
+ orrne r6, r6, ip, asl #(32 - \shift * 8)
+ stmiane r0!, {r3-r6}
+
+ tst r2, #8
+ movne r3, ip, lsr #(\shift * 8)
+ ldmiane r1!, {r4, ip}
+ orrne r3, r3, r4, asl #(32 - \shift * 8)
+ movne r4, r4, lsr #(\shift * 8)
+ orrne r4, r4, ip, asl #(32 - \shift * 8)
+ stmiane r0!, {r3-r4}
+
+ tst r2, #4
+ movne r3, ip, lsr #(\shift * 8)
+ ldrne ip, [r1], #4
+ sub r1, r1, #(4 - \shift)
+ orrne r3, r3, ip, asl #(32 - \shift * 8)
+ strne r3, [r0], #4
+
+ tst r2, #2
+ ldrbne r3, [r1], #1
+ ldrbne r4, [r1], #1
+ ldr r5, [sp], #4
+ strbne r3, [r0], #1
+ strbne r4, [r0], #1
+
+ tst r2, #1
+ ldrbne r3, [r1], #1
+ ldr r6, [sp], #4
+ strbne r3, [r0], #1
+
+ ldmfd sp!, {r0, r4}
+
+ bx lr
+.endm
+
+/*
+ * Memcpy function with Raspberry Pi specific aligned prefetch, based on
+ * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S
+ */
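+/*
+ * In C terms this is intended to behave like the standard memcpy():
+ *   void *memcpy_armv5te(void *dst, const void *src, size_t n);
+ * with dst in r0, src in r1, n in r2; the original dst is returned in r0.
+ */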
+asm_function memcpy_armv5te
+ cmp r2, #20
+ blt 9f
+ /* copy data until the destination address is 4-byte aligned */
+ tst r0, #1
+ ldrbne r3, [r1], #1
+ stmfd sp!, {r0, r4}
+ subne r2, r2, #1
+ strbne r3, [r0], #1
+ tst r0, #2
+ ldrbne r3, [r1], #1
+ ldrbne r4, [r1], #1
+ stmfd sp!, {r5, r6}
+ subne r2, r2, #2
+ orrne r3, r3, r4, asl #8
+ strhne r3, [r0], #2
+ /* destination address is now 4-byte aligned */
+ /* now we should handle 4 cases of source address alignment */
+ tst r1, #1
+ bne 6f
+ tst r1, #2
+ bne 7f
+
+ /* both source and destination are 4 bytes aligned */
+ stmfd sp!, {r7, r8, r9, r10, r11}
+ tst r0, #4
+ ldrne r4, [r1], #4
+ subne r2, r2, #4
+ strne r4, [r0], #4
+ tst r0, #8
+ ldmiane r1!, {r3-r4}
+ add r9, r1, #96
+ subne r2, r2, #8
+ bic r9, r9, #31
+ stmiane r0!, {r3-r4}
+ sub r9, r9, r1
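+ /* r9 = pld offset: distance from r1 to the 32-byte aligned address at or
+    below r1 + 96, so the prefetches in the loop stay 32-byte aligned */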
+1:
+ subs r2, r2, #32
+ ldmiage r1!, {r3-r6, r7, r8, r10, r11}
+ pld [r1, r9]
+ stmiage r0!, {r3-r6}
+ stmiage r0!, {r7, r8, r10, r11}
+ bgt 1b
+2:
+ ldmfd sp!, {r7, r8, r9, r10, r11}
+ tst r2, #16
+ ldmiane r1!, {r3-r6}
+ stmiane r0!, {r3-r6}
+ tst r2, #8
+ ldmiane r1!, {r3-r4}
+ stmiane r0!, {r3-r4}
+ tst r2, #4
+ ldrne r3, [r1], #4
+ mov ip, r0
+ strne r3, [ip], #4
+ tst r2, #2
+ ldrhne r3, [r1], #2
+ ldmfd sp!, {r5, r6}
+ strhne r3, [ip], #2
+ tst r2, #1
+ ldrbne r3, [r1], #1
+ ldmfd sp!, {r0, r4}
+ strbne r3, [ip], #1
+
+ bx lr
+
+6:
+ tst r1, #2
+ bne 8f
+ UNALIGNED_MEMCPY 1
+7:
+ UNALIGNED_MEMCPY 2
+8:
+ UNALIGNED_MEMCPY 3
+9:
+ stmfd sp!, {r0, r4}
+1: subs r2, r2, #3
+ ldrbge ip, [r0]
+ ldrbge r3, [r1], #1
+ ldrbge r4, [r1], #1
+ ldrbge ip, [r1], #1
+ strbge r3, [r0], #1
+ strbge r4, [r0], #1
+ strbge ip, [r0], #1
+ bge 1b
+ adds r2, r2, #2
+ ldrbge r3, [r1], #1
+ mov ip, r0
+ ldr r0, [sp], #4
+ strbge r3, [ip], #1
+ ldrbgt r3, [r1], #1
+ ldr r4, [sp], #4
+ strbgt r3, [ip], #1
+ bx lr
+.endfunc
+
+/******************************************************************************/
+
+/*
+ * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem)
+ *
+ * Both the 'scratch' and 'fbmem' pointers must be 32-byte aligned.
+ * The value in 'numbytes' is also rounded up to a multiple of 32 bytes.
+ *
+ * The only purpose of this code is to minimize the penalty incurred by
+ * doing uncached reads from memory (for example, the framebuffer). We
+ * try to issue the largest possible perfectly aligned reads to fetch the
+ * data into a temporary scratch buffer that lives in the L1 cache.
+ */
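+/*
+ * Example: a request for 100 bytes actually copies 128 bytes
+ * (64 + 32 + one full 32-byte block for the 100 & 31 remainder).
+ */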
+
+asm_function aligned_fetch_fbmem_to_scratch_neon
+ SIZE .req r0
+ DST .req r1
+ SRC .req r2
+
+ subs SIZE, #128
+ blt 1f
+0:
+ /* aligned load from the source (framebuffer) */
+ vld1.64 {q0, q1}, [SRC, :256]!
+ vld1.64 {q2, q3}, [SRC, :256]!
+ vld1.64 {q8, q9}, [SRC, :256]!
+ vld1.64 {q10, q11}, [SRC, :256]!
+ /* fetch destination (scratch buffer) into L1 cache */
+ ldr r3, [DST]
+ ldr ip, [DST, #64]
+ /* aligned store to the scratch buffer */
+ vst1.64 {q0, q1}, [DST, :256]!
+ vst1.64 {q2, q3}, [DST, :256]!
+ vst1.64 {q8, q9}, [DST, :256]!
+ vst1.64 {q10, q11}, [DST, :256]!
+ subs SIZE, SIZE, #128
+ bge 0b
+1:
+ tst SIZE, #64
+ beq 1f
+ vld1.64 {q0, q1}, [SRC, :256]!
+ vld1.64 {q2, q3}, [SRC, :256]!
+ ldr r3, [DST]
+ vst1.64 {q0, q1}, [DST, :256]!
+ vst1.64 {q2, q3}, [DST, :256]!
+1:
+ tst SIZE, #32
+ beq 1f
+ vld1.64 {q0, q1}, [SRC, :256]!
+ vst1.64 {q0, q1}, [DST, :256]!
+1:
+ tst SIZE, #31
+ beq 1f
+ vld1.64 {q0, q1}, [SRC, :256]!
+ vst1.64 {q0, q1}, [DST, :256]!
+1:
+ bx lr
+
+ .unreq SIZE
+ .unreq DST
+ .unreq SRC
+.endfunc
+
+asm_function aligned_fetch_fbmem_to_scratch_vfp
+ SIZE .req r0
+ DST .req r1
+ SRC .req r2
+
+ vpush {d8-d15}
+ subs SIZE, #128
+ blt 1f
+0:
+ /* aligned load from the source (framebuffer) */
+ vldm SRC!, {d0-d15}
+ /* aligned store to the scratch buffer */
+ vstm DST!, {d0-d15}
+ subs SIZE, SIZE, #128
+ bge 0b
+1:
+ tst SIZE, #64
+ beq 1f
+ vldm SRC!, {d0, d1, d2, d3, d4, d5, d6, d7}
+ vstm DST!, {d0, d1, d2, d3, d4, d5, d6, d7}
+1:
+ tst SIZE, #32
+ beq 1f
+ vldm SRC!, {d0, d1, d2, d3}
+ vstm DST!, {d0, d1, d2, d3}
+1:
+ tst SIZE, #31
+ beq 1f
+ vldm SRC!, {d0, d1, d2, d3}
+ vstm DST!, {d0, d1, d2, d3}
+1:
+ vpop {d8-d15}
+ bx lr
+
+ .unreq SIZE
+ .unreq DST
+ .unreq SRC
+.endfunc
+
+asm_function aligned_fetch_fbmem_to_scratch_arm
+ SIZE .req r0
+ DST .req r1
+ SRC .req r2
+
+ push {r4-r11, lr}
+ subs SIZE, #128
+ blt 1f
+0:
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+ subs SIZE, SIZE, #128
+ bge 0b
+1:
+ tst SIZE, #64
+ beq 1f
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+1:
+ tst SIZE, #32
+ beq 1f
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+1:
+ tst SIZE, #31
+ beq 1f
+ ldmia SRC!, {r4-r11}
+ stmia DST!, {r4-r11}
+1:
+ pop {r4-r11, pc}
+
+ .unreq SIZE
+ .unreq DST
+ .unreq SRC
+.endfunc
+
+#endif
--- /dev/null
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <inc_libc64_mini.h>
+
+void aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, const void *fbmem);
+void aligned_fetch_fbmem_to_scratch_vfp (int numbytes, void *scratch, const void *fbmem);
+
+#define TEST_SIZE (1024 * 1024)
+#define TRIES 16
+
+static char dummy [TEST_SIZE] __attribute__((aligned(64)));
+static char dummy2[TEST_SIZE] __attribute__((aligned(64)));
+
+static inline void pcnt_init(void)
+{
+ int v;
+ asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v));
+ v |= 5;  // PMCR: master enable + cycle counter reset
+ v &= ~8; // PMCR: clear the clock divider (count every cycle, not every 64th)
+ asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v));
+ // PMCNTENSET: enable the cycle counter (bit 31)
+ asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31));
+}
+
+static inline unsigned int pcnt_get(void)
+{
+ unsigned int val;
+ __asm__ volatile("mrc p15, 0, %0, c9, c13, 0"
+ : "=r"(val));
+ return val;
+}
+
+#define make_rd_test(name, type) \
+static int name(const void *mem_, size_t size) \
+{ \
+ const type *mem = mem_; \
+ int sum = 0; \
+ \
+ size /= sizeof(*mem); \
+ while (size-- > 0) \
+ sum += *mem++; \
+ \
+ return sum; \
+}
+
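+// Each make_rd_test(name, type) expands to a reader that walks the buffer as
+// an array of 'type' and sums it (e.g. read_c_8 loads one int8_t per step);
+// the sum is returned, presumably so the loads don't get optimized away.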
+make_rd_test(read_c_8, int8_t)
+make_rd_test(read_c_16, int16_t)
+make_rd_test(read_c_32, int32_t)
+make_rd_test(read_c_64, int64_t)
+
+static int read_ldrd(const void *mem, size_t size)
+{
+ size /= 8;
+ asm volatile(
+ "0: ldrd r2, r3, [%0], #8\n"
+ " subs %1, #1\n"
+ " bgt 0b\n"
+ : "=&r"(mem), "=&r"(size)
+ : "0"(mem), "1"(size)
+ : "r2", "r3", "cc");
+ return 0;
+}
+
+static int read_ldrd_pld(const void *mem, size_t size)
+{
+ size /= 8;
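+ // same as read_ldrd, but also prefetch 256 bytes ahead of the read pointer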
+ asm volatile(
+ "0: ldrd r2, r3, [%0], #8\n"
+ " subs %1, #1\n"
+ " pld [%0, #64*4]\n"
+ " bgt 0b\n"
+ : "=&r"(mem), "=&r"(size)
+ : "0"(mem), "1"(size)
+ : "r2", "r3", "cc");
+ return 0;
+}
+
+static int g_skip;
+static int read_c_32_skip(const void *mem_, size_t size)
+{
+ const int *mem = mem_;
+ int skip = g_skip / 4;
+ int sum = 0;
+ size_t i;
+
+ size /= 4;
+ for (i = 0; i < size; i += skip)
+ sum += mem[i];
+
+ return sum;
+}
+
+static int read_fbt_neon(const void *mem, size_t size)
+{
+ size_t i;
+
+ for (i = 0; i < size; i += 256)
+ aligned_fetch_fbmem_to_scratch_neon(256, dummy2, mem + i);
+
+ return 0;
+}
+
+static int read_fbt_vfp(const void *mem, size_t size)
+{
+ size_t i;
+
+ for (i = 0; i < size; i += 256)
+ aligned_fetch_fbmem_to_scratch_vfp(256, dummy2, mem + i);
+
+ return 0;
+}
+
+static unsigned int run(const char *name,
+ void (*inv)(const void *mem, size_t size),
+ int (*test)(const void *mem, size_t size),
+ const void *mem, unsigned int baseline)
+{
+ unsigned int i, cycles, smallest = ~0;
+
+ for (i = 0; i < TRIES; i++) {
+ cycles = pcnt_get();
+ if (inv)
+ inv(mem, TEST_SIZE);
+ test(mem, TEST_SIZE);
+ cycles = pcnt_get() - cycles;
+
+ if (cycles < smallest)
+ smallest = cycles;
+ }
+
+ printf("%-10s %6uk", name, smallest / 1000);
+ if (baseline != 0)
+ printf(" %5lld%%", smallest * 100ull / baseline);
+ printf("\n");
+
+ return smallest;
+}
+
+static void run_all(const char *name,
+ void (*inv)(const void *mem, size_t size),
+ const void *mem)
+{
+ static unsigned int b[16];
+ unsigned int r[16];
+ int t = 0;
+
+ printf("%s\n", name);
+ r[t] = run(" 8", inv, read_c_8, mem, b[t]); t++;
+ r[t] = run(" 16", inv, read_c_16, mem, b[t]); t++;
+ r[t] = run(" 32", inv, read_c_32, mem, b[t]); t++;
+ r[t] = run(" 64", inv, read_c_64, mem, b[t]); t++;
+ g_skip = 32;
+ r[t] = run(" 32_s32", inv, read_c_32_skip, mem, b[t]); t++;
+ g_skip = 64;
+ r[t] = run(" 32_s64", inv, read_c_32_skip, mem, b[t]); t++;
+ r[t] = run(" ldrd", inv, read_ldrd, mem, b[t]); t++;
+ r[t] = run(" ldrd pld", inv, read_ldrd_pld, mem, b[t]); t++;
+ r[t] = run(" fbt neon", inv, read_fbt_neon, mem, b[t]); t++;
+ r[t] = run(" fbt vfp", inv, read_fbt_vfp, mem, b[t]); t++;
+
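+ // the first run_all() call (the plain .bss test) becomes the 100% baseline
+ // that later runs are compared against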
+ if (b[0] == 0)
+ memcpy(b, r, sizeof(r[0]) * t);
+}
+
+static void shm_inv(const void *mem, size_t size)
+{
+ dsp_cache_inv_virt((void *)mem, size);
+}
+
+static void run_shm(const char *name, dsp_cache_t ct, int use_inv)
+{
+ dsp_mem_region_t region;
+ void *mem;
+
+ region = dsp_shm_alloc(ct, TEST_SIZE);
+ if (region.size < TEST_SIZE || region.virt_addr == 0) {
+ fprintf(stderr, "dsp_shm_alloc failed\n");
+ return;
+ }
+ mem = (void *)region.virt_addr;
+ // printf("mapped %d %p\n", ct, mem);
+
+ run_all(name, use_inv ? shm_inv : NULL, mem);
+
+ dsp_shm_free(region);
+}
+
+int main()
+{
+ int ret;
+
+ // prefault
+ memset(dummy, 1, sizeof(dummy));
+ memset(dummy2, 1, sizeof(dummy2));
+ printf("pid: %d dummy: %p\n", (int)getpid(), dummy);
+
+ pcnt_init();
+
+ run_all(".bss", NULL, dummy);
+
+ ret = dsp_open();
+ if (ret != 0) {
+ fprintf(stderr, "dsp_open %d\n", ret);
+ return 1;
+ }
+
+ run_shm("shm wb", DSP_CACHE_RW, 0);
+ //run_shm("shm wt", DSP_CACHE_R, 0);
+ run_shm("shm nc", DSP_CACHE_W, 0);
+ //run_shm("shm wb inv", DSP_CACHE_RW, 1);
+ run_shm("shm wt inv", DSP_CACHE_R, 1);
+
+ dsp_close();
+
+ return 0;
+}