From: Grazvydas Ignotas
Date: Sat, 3 Jan 2015 18:32:35 +0000 (+0200)
Subject: tests: add uncached memory test
X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-misc.git;a=commitdiff_plain;h=5cee7704de91bd08fe38b2f6991980ec2a53a0c9

tests: add uncached memory test
---

diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 0000000..9cab9d1
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1,6 @@
+*.o
+*.cmd
+*.mod.c
+.tmp_versions
+Module.symvers
+modules.order
diff --git a/tests/Makefile b/tests/Makefile
index b9f78a8..d1b7646 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -14,5 +14,8 @@ wlstat : LDFLAGS += -lrt
 memspeed : memspeed.c neoncpy.S
 	$(CC) $(CFLAGS) $^ -o $@
 
+test_uncached : test_uncached.c neoncpy.S fbturbo_asm.S
+	$(CC) $(CFLAGS) $^ -o $@ -lc64 -I$(C64T)/include -L$(C64T)/libc64/
+
 %: %.c
 	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
diff --git a/tests/fbturbo_asm.S b/tests/fbturbo_asm.S
new file mode 100644
index 0000000..2bd91a7
--- /dev/null
+++ b/tests/fbturbo_asm.S
@@ -0,0 +1,556 @@
+/*
+ * Copyright © 2006-2008, 2013 Siarhei Siamashka
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#ifdef __arm__
+
+.text
+.syntax unified
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.altmacro
+.p2align 2
+
+/******************************************************************************/
+
+.macro asm_function function_name
+    .global \function_name
+#ifdef __ELF__
+    .hidden \function_name
+    .type \function_name, %function
+#endif
+.func \function_name
+\function_name:
+.endm
+
+/******************************************************************************/
+
+/*
+ * writeback_scratch_to_mem_neon(int numbytes, void *dst, void *src)
+ *
+ * Copy a chunk of data from a cached scratch buffer (so prefetch is not
+ * really needed), to a memory buffer in forward direction. Generated from
+ * pixman macro templates.
+ */
+
+asm_function writeback_scratch_to_mem_neon
+    mov         ip, r1
+    cmp         r0, #32
+    blt         0f
+    tst         ip, #15
+    beq         1f
+    tst         ip, #1
+    beq         2f
+    vld1.8      {d0[1]}, [r2]!
+    add         ip, ip, #1
+    sub         r0, r0, #1
+2:  tst         ip, #2
+    beq         3f
+    vld1.8      {d0[2]}, [r2]!
+    vld1.8      {d0[3]}, [r2]!
+    add         ip, ip, #2
+    sub         r0, r0, #2
+3:  tst         ip, #4
+    beq         4f
+    vld1.8      {d0[4]}, [r2]!
+    vld1.8      {d0[5]}, [r2]!
+    vld1.8      {d0[6]}, [r2]!
+    vld1.8      {d0[7]}, [r2]!
+ add ip, ip, #4 + sub r0, r0, #4 +4: tst ip, #8 + beq 5f + vld1.8 {d1}, [r2]! + add ip, ip, #8 + sub r0, r0, #8 +5: vld1.8 {d2-d3}, [r2]! + add ip, ip, #16 + sub r0, r0, #16 + tst r1, #1 + beq 6f + vst1.8 {d0[1]}, [r1]! +6: tst r1, #2 + beq 7f + vst1.8 {d0[2]}, [r1]! + vst1.8 {d0[3]}, [r1]! +7: tst r1, #4 + beq 8f + vst1.8 {d0[4]}, [r1]! + vst1.8 {d0[5]}, [r1]! + vst1.8 {d0[6]}, [r1]! + vst1.8 {d0[7]}, [r1]! +8: tst r1, #8 + beq 9f + vst1.8 {d1}, [r1, :64]! +9: vst1.8 {d2-d3}, [r1, :128]! +1: subs r0, r0, #32 + blt 10f + vld1.8 {d0-d3}, [r2]! + subs r0, r0, #32 + blt 11f +12: vst1.8 {d0-d3}, [r1, :128]! + vld1.8 {d0-d3}, [r2]! + subs r0, r0, #32 + bge 12b +11: vst1.8 {d0-d3}, [r1, :128]! +10: tst r0, #31 + beq 13f + tst r0, #16 + beq 14f + vld1.8 {d2-d3}, [r2]! +14: tst r0, #8 + beq 15f + vld1.8 {d1}, [r2]! +15: tst r0, #4 + beq 16f + vld1.8 {d0[4]}, [r2]! + vld1.8 {d0[5]}, [r2]! + vld1.8 {d0[6]}, [r2]! + vld1.8 {d0[7]}, [r2]! +16: tst r0, #2 + beq 17f + vld1.8 {d0[2]}, [r2]! + vld1.8 {d0[3]}, [r2]! +17: tst r0, #1 + beq 18f + vld1.8 {d0[1]}, [r2]! +18: tst r0, #16 + beq 19f + vst1.8 {d2-d3}, [r1, :128]! +19: tst r0, #8 + beq 20f + vst1.8 {d1}, [r1, :64]! +20: tst r0, #4 + beq 21f + vst1.8 {d0[4]}, [r1]! + vst1.8 {d0[5]}, [r1]! + vst1.8 {d0[6]}, [r1]! + vst1.8 {d0[7]}, [r1]! +21: tst r0, #2 + beq 22f + vst1.8 {d0[2]}, [r1]! + vst1.8 {d0[3]}, [r1]! +22: tst r0, #1 + beq 13f + vst1.8 {d0[1]}, [r1]! +13: bx lr +0: tst r0, #31 + beq 23f + tst r0, #16 + beq 24f + vld1.8 {d2-d3}, [r2]! +24: tst r0, #8 + beq 25f + vld1.8 {d1}, [r2]! +25: tst r0, #4 + beq 26f + vld1.8 {d0[4]}, [r2]! + vld1.8 {d0[5]}, [r2]! + vld1.8 {d0[6]}, [r2]! + vld1.8 {d0[7]}, [r2]! +26: tst r0, #2 + beq 27f + vld1.8 {d0[2]}, [r2]! + vld1.8 {d0[3]}, [r2]! +27: tst r0, #1 + beq 28f + vld1.8 {d0[1]}, [r2]! +28: tst r0, #16 + beq 29f + vst1.8 {d2-d3}, [r1]! +29: tst r0, #8 + beq 30f + vst1.8 {d1}, [r1]! +30: tst r0, #4 + beq 31f + vst1.8 {d0[4]}, [r1]! + vst1.8 {d0[5]}, [r1]! + vst1.8 {d0[6]}, [r1]! + vst1.8 {d0[7]}, [r1]! +31: tst r0, #2 + beq 32f + vst1.8 {d0[2]}, [r1]! + vst1.8 {d0[3]}, [r1]! +32: tst r0, #1 + beq 23f + vst1.8 {d0[1]}, [r1]! +23: bx lr +.endfunc + +/******************************************************************************/ + +/* + * Helper macro for memcpy function, it can copy data from source (r1) to + * destination (r0) buffers fixing alignment in the process. Destination + * buffer should be aligned already (4 bytes alignment is required. 
+ * Size of the block to copy is in r2 register + */ +.macro UNALIGNED_MEMCPY shift + sub r1, #(\shift) + ldr ip, [r1], #4 + + tst r0, #4 + movne r3, ip, lsr #(\shift * 8) + ldrne ip, [r1], #4 + subne r2, r2, #4 + orrne r3, r3, ip, asl #(32 - \shift * 8) + strne r3, [r0], #4 + + tst r0, #8 + movne r3, ip, lsr #(\shift * 8) + ldmiane r1!, {r4, ip} + subne r2, r2, #8 + orrne r3, r3, r4, asl #(32 - \shift * 8) + movne r4, r4, lsr #(\shift * 8) + orrne r4, r4, ip, asl #(32 - \shift * 8) + stmiane r0!, {r3-r4} + cmp r2, #32 + blt 3f + pld [r1, #48] + stmfd sp!, {r7, r8, r9, r10, r11} + add r3, r1, #128 + bic r3, r3, #31 + sub r9, r3, r1 +1: + pld [r1, r9] + subs r2, r2, #32 + movge r3, ip, lsr #(\shift * 8) + ldmiage r1!, {r4-r6, r7, r8, r10, r11, ip} + orrge r3, r3, r4, asl #(32 - \shift * 8) + movge r4, r4, lsr #(\shift * 8) + orrge r4, r4, r5, asl #(32 - \shift * 8) + movge r5, r5, lsr #(\shift * 8) + orrge r5, r5, r6, asl #(32 - \shift * 8) + movge r6, r6, lsr #(\shift * 8) + orrge r6, r6, r7, asl #(32 - \shift * 8) + stmiage r0!, {r3-r6} + movge r7, r7, lsr #(\shift * 8) + orrge r7, r7, r8, asl #(32 - \shift * 8) + movge r8, r8, lsr #(\shift * 8) + orrge r8, r8, r10, asl #(32 - \shift * 8) + movge r10, r10, lsr #(\shift * 8) + orrge r10, r10, r11, asl #(32 - \shift * 8) + movge r11, r11, lsr #(\shift * 8) + orrge r11, r11, ip, asl #(32 - \shift * 8) + stmiage r0!, {r7, r8, r10, r11} + bgt 1b +2: + ldmfd sp!, {r7, r8, r9, r10, r11} +3: /* copy remaining data */ + tst r2, #16 + movne r3, ip, lsr #(\shift * 8) + ldmiane r1!, {r4-r6, ip} + orrne r3, r3, r4, asl #(32 - \shift * 8) + movne r4, r4, lsr #(\shift * 8) + orrne r4, r4, r5, asl #(32 - \shift * 8) + movge r5, r5, lsr #(\shift * 8) + orrge r5, r5, r6, asl #(32 - \shift * 8) + movge r6, r6, lsr #(\shift * 8) + orrge r6, r6, ip, asl #(32 - \shift * 8) + stmiane r0!, {r3-r6} + + tst r2, #8 + movne r3, ip, lsr #(\shift * 8) + ldmiane r1!, {r4, ip} + orrne r3, r3, r4, asl #(32 - \shift * 8) + movne r4, r4, lsr #(\shift * 8) + orrne r4, r4, ip, asl #(32 - \shift * 8) + stmiane r0!, {r3-r4} + + tst r2, #4 + movne r3, ip, lsr #(\shift * 8) + ldrne ip, [r1], #4 + sub r1, r1, #(4 - \shift) + orrne r3, r3, ip, asl #(32 - \shift * 8) + strne r3, [r0], #4 + + tst r2, #2 + ldrbne r3, [r1], #1 + ldrbne r4, [r1], #1 + ldr r5, [sp], #4 + strbne r3, [r0], #1 + strbne r4, [r0], #1 + + tst r2, #1 + ldrbne r3, [r1], #1 + ldr r6, [sp], #4 + strbne r3, [r0], #1 + + ldmfd sp!, {r0, r4} + + bx lr +.endm + +/* + * Memcpy function with Raspberry Pi specific aligned prefetch, based on + * https://garage.maemo.org/plugins/scmsvn/viewcvs.php/mplayer/trunk/fastmem-arm9/fastmem-arm9.S + */ +asm_function memcpy_armv5te + cmp r2, #20 + blt 9f + /* copy data until destination address is 4 bytes aligned */ + tst r0, #1 + ldrbne r3, [r1], #1 + stmfd sp!, {r0, r4} + subne r2, r2, #1 + strbne r3, [r0], #1 + tst r0, #2 + ldrbne r3, [r1], #1 + ldrbne r4, [r1], #1 + stmfd sp!, {r5, r6} + subne r2, r2, #2 + orrne r3, r3, r4, asl #8 + strhne r3, [r0], #2 + /* destination address is 4 bytes aligned */ + /* now we should handle 4 cases of source address alignment */ + tst r1, #1 + bne 6f + tst r1, #2 + bne 7f + + /* both source and destination are 4 bytes aligned */ + stmfd sp!, {r7, r8, r9, r10, r11} + tst r0, #4 + ldrne r4, [r1], #4 + subne r2, r2, #4 + strne r4, [r0], #4 + tst r0, #8 + ldmiane r1!, {r3-r4} + add r9, r1, #96 + subne r2, r2, #8 + bic r9, r9, #31 + stmiane r0!, {r3-r4} + sub r9, r9, r1 +1: + subs r2, r2, #32 + ldmiage r1!, {r3-r6, r7, r8, r10, r11} + pld [r1, r9] + 
stmiage r0!, {r3-r6} + stmiage r0!, {r7, r8, r10, r11} + bgt 1b +2: + ldmfd sp!, {r7, r8, r9, r10, r11} + tst r2, #16 + ldmiane r1!, {r3-r6} + stmiane r0!, {r3-r6} + tst r2, #8 + ldmiane r1!, {r3-r4} + stmiane r0!, {r3-r4} + tst r2, #4 + ldrne r3, [r1], #4 + mov ip, r0 + strne r3, [ip], #4 + tst r2, #2 + ldrhne r3, [r1], #2 + ldmfd sp!, {r5, r6} + strhne r3, [ip], #2 + tst r2, #1 + ldrbne r3, [r1], #1 + ldmfd sp!, {r0, r4} + strbne r3, [ip], #1 + + bx lr + +6: + tst r1, #2 + bne 8f + UNALIGNED_MEMCPY 1 +7: + UNALIGNED_MEMCPY 2 +8: + UNALIGNED_MEMCPY 3 +9: + stmfd sp!, {r0, r4} +1: subs r2, r2, #3 + ldrbge ip, [r0] + ldrbge r3, [r1], #1 + ldrbge r4, [r1], #1 + ldrbge ip, [r1], #1 + strbge r3, [r0], #1 + strbge r4, [r0], #1 + strbge ip, [r0], #1 + bge 1b + adds r2, r2, #2 + ldrbge r3, [r1], #1 + mov ip, r0 + ldr r0, [sp], #4 + strbge r3, [ip], #1 + ldrbgt r3, [r1], #1 + ldr r4, [sp], #4 + strbgt r3, [ip], #1 + bx lr +.endfunc + +/******************************************************************************/ + +/* + * aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, void *fbmem) + * + * Both 'scratch' and 'fbmem' pointers must be 32 bytes aligned. + * The value in 'numbytes' is also rounded up to a multiple of 32 bytes. + + * The only purpose of this code is to attempt minimizing penalty incured + * by doing uncached reads from memory (for example framebuffer). We are + * trying to do the largest possible perfectly aligned reads to fetch + * data into a temporary scratch buffer in L1 cache. + */ + +asm_function aligned_fetch_fbmem_to_scratch_neon + SIZE .req r0 + DST .req r1 + SRC .req r2 + + subs SIZE, #128 + blt 1f +0: + /* aligned load from the source (framebuffer) */ + vld1.64 {q0, q1}, [SRC, :256]! + vld1.64 {q2, q3}, [SRC, :256]! + vld1.64 {q8, q9}, [SRC, :256]! + vld1.64 {q10, q11}, [SRC, :256]! + /* fetch destination (scratch buffer) into L1 cache */ + ldr r3, [DST] + ldr ip, [DST, #64] + /* aligned store to the scratch buffer */ + vst1.64 {q0, q1}, [DST, :256]! + vst1.64 {q2, q3}, [DST, :256]! + vst1.64 {q8, q9}, [DST, :256]! + vst1.64 {q10, q11}, [DST, :256]! + subs SIZE, SIZE, #128 + bge 0b +1: + tst SIZE, #64 + beq 1f + vld1.64 {q0, q1}, [SRC, :256]! + vld1.64 {q2, q3}, [SRC, :256]! + ldr r3, [DST] + vst1.64 {q0, q1}, [DST, :256]! + vst1.64 {q2, q3}, [DST, :256]! +1: + tst SIZE, #32 + beq 1f + vld1.64 {q0, q1}, [SRC, :256]! + vst1.64 {q0, q1}, [DST, :256]! +1: + tst SIZE, #31 + beq 1f + vld1.64 {q0, q1}, [SRC, :256]! + vst1.64 {q0, q1}, [DST, :256]! 
+1:
+    bx          lr
+
+    .unreq      SIZE
+    .unreq      DST
+    .unreq      SRC
+.endfunc
+
+asm_function aligned_fetch_fbmem_to_scratch_vfp
+    SIZE        .req    r0
+    DST         .req    r1
+    SRC         .req    r2
+
+    vpush       {d8-d15}
+    subs        SIZE, #128
+    blt         1f
+0:
+    /* aligned load from the source (framebuffer) */
+    vldm        SRC!, {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15}
+    /* aligned store to the scratch buffer */
+    vstm        DST!, {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15}
+    subs        SIZE, SIZE, #128
+    bge         0b
+1:
+    tst         SIZE, #64
+    beq         1f
+    vldm        SRC!, {d0, d1, d2, d3, d4, d5, d6, d7}
+    vstm        DST!, {d0, d1, d2, d3, d4, d5, d6, d7}
+1:
+    tst         SIZE, #32
+    beq         1f
+    vldm        SRC!, {d0, d1, d2, d3}
+    vstm        DST!, {d0, d1, d2, d3}
+1:
+    tst         SIZE, #31
+    beq         1f
+    vldm        SRC!, {d0, d1, d2, d3}
+    vstm        DST!, {d0, d1, d2, d3}
+1:
+    vpop        {d8-d15}
+    bx          lr
+
+    .unreq      SIZE
+    .unreq      DST
+    .unreq      SRC
+.endfunc
+
+asm_function aligned_fetch_fbmem_to_scratch_arm
+    SIZE        .req    r0
+    DST         .req    r1
+    SRC         .req    r2
+
+    push        {r4-r11, lr}
+    subs        SIZE, #128
+    blt         1f
+0:
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    subs        SIZE, SIZE, #128
+    bge         0b
+1:
+    tst         SIZE, #64
+    beq         1f
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+1:
+    tst         SIZE, #32
+    beq         1f
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+1:
+    tst         SIZE, #31
+    beq         1f
+    ldmia       SRC!, {r4-r11}
+    stmia       DST!, {r4-r11}
+1:
+    pop         {r4-r11, pc}
+
+    .unreq      SIZE
+    .unreq      DST
+    .unreq      SRC
+.endfunc
+
+#endif
diff --git a/tests/test_uncached.c b/tests/test_uncached.c
new file mode 100644
index 0000000..f1fd070
--- /dev/null
+++ b/tests/test_uncached.c
@@ -0,0 +1,220 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include <libc64_dsp.h> /* c64_tools DSP API; exact header name assumed */
+
+void aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, const void *fbmem);
+void aligned_fetch_fbmem_to_scratch_vfp (int numbytes, void *scratch, const void *fbmem);
+
+#define TEST_SIZE (1024 * 1024)
+#define TRIES 16
+
+static char dummy [TEST_SIZE] __attribute__((aligned(64)));
+static char dummy2[TEST_SIZE] __attribute__((aligned(64)));
+
+static inline void pcnt_init(void)
+{
+	int v;
+	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v));
+	v |= 5; // master enable, ccnt reset
+	v &= ~8; // ccnt divider 0
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v));
+	// enable cycle counter
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31));
+}
+
+static inline unsigned int pcnt_get(void)
+{
+	unsigned int val;
+	__asm__ volatile("mrc p15, 0, %0, c9, c13, 0"
+			 : "=r"(val));
+	return val;
+}
+
+#define make_rd_test(name, type) \
+static int name(const void *mem_, size_t size) \
+{ \
+	const type *mem = mem_; \
+	int sum = 0; \
+	\
+	size /= sizeof(*mem); \
+	while (size-- > 0) \
+		sum += *mem++; \
+	\
+	return sum; \
+}
+
+make_rd_test(read_c_8, int8_t)
+make_rd_test(read_c_16, int16_t)
+make_rd_test(read_c_32, int32_t)
+make_rd_test(read_c_64, int64_t)
+
+static int read_ldrd(const void *mem, size_t size)
+{
+	size /= 8;
+	asm volatile(
+		"0: ldrd r2, r3, [%0], #8\n"
+		"   subs %1, #1\n"
+		"   bgt 0b\n"
+		: "=&r"(mem), "=&r"(size)
+		: "0"(mem), "1"(size)
+		: "r2", "r3", "cc");
+	return 0;
+}
+
+static int read_ldrd_pld(const void *mem, size_t size)
+{
+	size /= 8;
+	asm volatile(
+		"0: ldrd r2, r3, [%0], #8\n"
+		"   subs %1, #1\n"
+		"   pld  [%0, #64*4]\n"
+		"   bgt 0b\n"
+		: "=&r"(mem), "=&r"(size)
+		: "0"(mem), "1"(size)
+		: "r2", "r3", "cc");
+	return 0;
+}
+
+static int g_skip;
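For context, the aligned_fetch_fbmem_to_scratch_* helpers above exist to soften the cost of uncached reads: pull one large, 32-byte-aligned burst from the uncached buffer into a small cached scratch area, then do the fine-grained accesses from that cached copy. A minimal sketch of that usage pattern (illustrative only, not part of the patch; sum_uncached and its 256-byte chunking are assumptions, though read_fbt_neon below also works in 256-byte chunks):

static unsigned char scratch[256] __attribute__((aligned(32)));

/* sum an uncached, 32-byte-aligned buffer whose size is a multiple of 256 */
static int sum_uncached(const unsigned char *fbmem, size_t size)
{
	int sum = 0;
	size_t i, j;

	for (i = 0; i < size; i += sizeof(scratch)) {
		/* one large aligned burst read from the uncached source */
		aligned_fetch_fbmem_to_scratch_neon(sizeof(scratch), scratch, fbmem + i);
		/* byte-sized reads now hit the cached copy instead of uncached memory */
		for (j = 0; j < sizeof(scratch); j++)
			sum += scratch[j];
	}
	return sum;
}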
+static int read_c_32_skip(const void *mem_, size_t size)
+{
+	const int *mem = mem_;
+	int skip = g_skip / 4;
+	int sum = 0;
+	size_t i;
+
+	size /= 4;
+	for (i = 0; i < size; i += skip)
+		sum += mem[i];
+
+	return sum;
+}
+
+static int read_fbt_neon(const void *mem, size_t size)
+{
+	size_t i;
+
+	for (i = 0; i < size; i += 256)
+		aligned_fetch_fbmem_to_scratch_neon(256, dummy2, mem + i);
+
+	return 0;
+}
+
+static int read_fbt_vfp(const void *mem, size_t size)
+{
+	size_t i;
+
+	for (i = 0; i < size; i += 256)
+		aligned_fetch_fbmem_to_scratch_vfp(256, dummy2, mem + i);
+
+	return 0;
+}
+
+static unsigned int run(const char *name,
+	void (*inv)(const void *mem, size_t size),
+	int (*test)(const void *mem, size_t size),
+	const void *mem, unsigned int baseline)
+{
+	unsigned int i, cycles, smallest = ~0;
+
+	for (i = 0; i < TRIES; i++) {
+		cycles = pcnt_get();
+		if (inv)
+			inv(mem, TEST_SIZE);
+		test(mem, TEST_SIZE);
+		cycles = pcnt_get() - cycles;
+
+		if (cycles < smallest)
+			smallest = cycles;
+	}
+
+	printf("%-10s %6uk", name, smallest / 1000);
+	if (baseline != 0)
+		printf(" %5lld%%", smallest * 100ull / baseline);
+	printf("\n");
+
+	return smallest;
+}
+
+static void run_all(const char *name,
+	void (*inv)(const void *mem, size_t size),
+	const void *mem)
+{
+	static unsigned int b[16];
+	unsigned int r[16];
+	int t = 0;
+
+	printf("%s\n", name);
+	r[t] = run(" 8", inv, read_c_8, mem, b[t]); t++;
+	r[t] = run(" 16", inv, read_c_16, mem, b[t]); t++;
+	r[t] = run(" 32", inv, read_c_32, mem, b[t]); t++;
+	r[t] = run(" 64", inv, read_c_64, mem, b[t]); t++;
+	g_skip = 32;
+	r[t] = run(" 32_s32", inv, read_c_32_skip, mem, b[t]); t++;
+	g_skip = 64;
+	r[t] = run(" 32_s64", inv, read_c_32_skip, mem, b[t]); t++;
+	r[t] = run(" ldrd", inv, read_ldrd, mem, b[t]); t++;
+	r[t] = run(" ldrd pld", inv, read_ldrd_pld, mem, b[t]); t++;
+	r[t] = run(" fbt neon", inv, read_fbt_neon, mem, b[t]); t++;
+	r[t] = run(" fbt vfp", inv, read_fbt_vfp, mem, b[t]); t++;
+
+	if (b[0] == 0)
+		memcpy(b, r, sizeof(b));
+}
+
+static void shm_inv(const void *mem, size_t size)
+{
+	dsp_cache_inv_virt((void *)mem, size);
+}
+
+static void run_shm(const char *name, dsp_cache_t ct, int use_inv)
+{
+	dsp_mem_region_t region;
+	void *mem;
+
+	region = dsp_shm_alloc(ct, TEST_SIZE);
+	if (region.size < TEST_SIZE || region.virt_addr == 0) {
+		fprintf(stderr, "dsp_shm_alloc failed\n");
+		return;
+	}
+	mem = (void *)region.virt_addr;
+	// printf("mapped %d %p\n", ct, mem);
+
+	run_all(name, use_inv ? shm_inv : NULL, mem);
+
+	dsp_shm_free(region);
+}
+
+int main()
+{
+	int ret;
+
+	// prefault
+	memset(dummy, 1, sizeof(dummy));
+	memset(dummy2, 1, sizeof(dummy2));
+	printf("pid: %d dummy: %p\n", (int)getpid(), dummy);
+
+	pcnt_init();
+
+	run_all(".bss", NULL, dummy);
+
+	ret = dsp_open();
+	if (ret != 0) {
+		fprintf(stderr, "dsp_open %d\n", ret);
+		return 1;
+	}
+
+	run_shm("shm wb", DSP_CACHE_RW, 0);
+	//run_shm("shm wt", DSP_CACHE_R, 0);
+	run_shm("shm nc", DSP_CACHE_W, 0);
+	//run_shm("shm wb inv", DSP_CACHE_RW, 1);
+	run_shm("shm wt inv", DSP_CACHE_R, 1);
+
+	dsp_close();
+
+	return 0;
+}
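If another data point is wanted, a NEON-intrinsics reader can be plugged into the same harness. This is a hypothetical extension, not part of the patch; it assumes <arm_neon.h> and a build with NEON enabled (e.g. -mfpu=neon):

#include <arm_neon.h>

static int read_neon_q(const void *mem_, size_t size)
{
	const int32_t *mem = mem_;
	int32x4_t acc = vdupq_n_s32(0);
	size_t i;

	/* 16 bytes per vector load; TEST_SIZE is a multiple of 16 */
	for (i = 0; i < size / sizeof(*mem); i += 4)
		acc = vaddq_s32(acc, vld1q_s32(mem + i));

	/* fold the vector accumulator so the loads are not optimized away */
	return vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1)
	     + vgetq_lane_s32(acc, 2) + vgetq_lane_s32(acc, 3);
}

It would then be timed like the existing readers, e.g. by adding
	r[t] = run(" neon q", inv, read_neon_q, mem, b[t]); t++;
to run_all() (the b[] and r[] arrays already have spare slots).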