--- /dev/null
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <inc_libc64_mini.h>
+
+// Block-fetch routines declared here and defined outside this file.
+// Both are called below as (byte count, destination scratch, source memory).
+void aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, const void *fbmem);
+void aligned_fetch_fbmem_to_scratch_vfp (int numbytes, void *scratch, const void *fbmem);
+
+// Number of bytes handed to every read test (1 MiB).
+#define TEST_SIZE (1024 * 1024)
+// How many times run() repeats each test; the smallest cycle count is kept.
+#define TRIES 16
+
+// 64-byte aligned buffers: `dummy` is the memory read by the ".bss" run,
+// `dummy2` is the scratch destination passed to the fbt fetch routines.
+static char dummy [TEST_SIZE] __attribute__((aligned(64)));
+static char dummy2[TEST_SIZE] __attribute__((aligned(64)));
+
+/*
+ * Set up the cycle counter through CP15.
+ *
+ * Read-modify-writes c9/c12/0 (sets bits 0 and 2: master enable and cycle
+ * counter reset; clears bit 3: cycle counter divider), then writes bit 31 to
+ * c9/c12/1 to enable the cycle counter.
+ *
+ * NOTE(review): these coprocessor accesses presumably require user-mode
+ * access to the performance monitors to have been enabled -- confirm on the
+ * target system.
+ */
+static inline void pcnt_init(void)
+{
+	unsigned int ctrl;
+
+	asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(ctrl));
+	ctrl |= 5u;	// master enable, ccnt reset
+	ctrl &= ~8u;	// ccnt divider 0
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(ctrl));
+	// enable cycle counter; 1u keeps the shift into bit 31 well-defined
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1u << 31));
+}
+
+/*
+ * Read CP15 c9/c13/0 and return its value.
+ * run() takes the difference of two such reads as the cycle count of a test.
+ */
+static inline unsigned int pcnt_get(void)
+{
+	unsigned int ticks;
+
+	__asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(ticks));
+
+	return ticks;
+}
+
+/*
+ * make_rd_test(name, type) - define `static int name(const void *, size_t)`.
+ *
+ * The generated function walks the buffer sequentially as an array of `type`,
+ * reading size / sizeof(type) whole elements (a trailing partial element is
+ * ignored) and returning their sum.
+ *
+ * The sum is accumulated in an unsigned int so that it wraps modulo 2^32
+ * instead of hitting signed-overflow undefined behaviour; wider elements are
+ * reduced to their low 32 bits before being added.
+ */
+#define make_rd_test(name, type) \
+static int name(const void *mem_, size_t size) \
+{ \
+	const type *mem = mem_; \
+	size_t count = size / sizeof(*mem); \
+	unsigned int sum = 0; \
+	\
+	while (count-- > 0) \
+		sum += (unsigned int)*mem++; \
+	\
+	return (int)sum; \
+}
+
+make_rd_test(read_c_8, int8_t)
+make_rd_test(read_c_16, int16_t)
+make_rd_test(read_c_32, int32_t)
+make_rd_test(read_c_64, int64_t)
+
+/*
+ * Read test using post-indexed ldrd: each iteration loads 8 bytes from `mem`
+ * into r2/r3 (the data is discarded) and advances the pointer by 8.
+ *
+ * size / 8 iterations are performed; sizes below 8 bytes load nothing.
+ * Always returns 0.
+ */
+static int read_ldrd(const void *mem, size_t size)
+{
+	size /= 8;
+	if (size == 0)
+		return 0;	// the loop below always loads at least once
+
+	// Both operands are read and written by the loop, hence "+r".
+	// "memory": the asm reads through the pointer, which the compiler
+	// cannot see from the operand list.
+	asm volatile(
+		"0: ldrd r2, r3, [%0], #8\n"
+		" subs %1, #1\n"
+		" bgt 0b\n"
+		: "+r"(mem), "+r"(size)
+		:
+		: "r2", "r3", "cc", "memory");
+	return 0;
+}
+
+/*
+ * Same loop as read_ldrd(), with a pld issued every iteration for the address
+ * 64*4 = 256 bytes past the already-advanced pointer.
+ *
+ * size / 8 iterations are performed; sizes below 8 bytes load nothing.
+ * Always returns 0.
+ */
+static int read_ldrd_pld(const void *mem, size_t size)
+{
+	size /= 8;
+	if (size == 0)
+		return 0;	// the loop below always loads at least once
+
+	// Both operands are read and written by the loop, hence "+r".
+	// "memory": the asm reads through the pointer, which the compiler
+	// cannot see from the operand list.
+	asm volatile(
+		"0: ldrd r2, r3, [%0], #8\n"
+		" subs %1, #1\n"
+		" pld [%0, #64*4]\n"
+		" bgt 0b\n"
+		: "+r"(mem), "+r"(size)
+		:
+		: "r2", "r3", "cc", "memory");
+	return 0;
+}
+
+/* Stride, in bytes, used by read_c_32_skip(); the caller sets it before a run. */
+static int g_skip;
+
+/*
+ * Strided 32-bit read test: reads one 32-bit element every g_skip bytes from
+ * the first size / 4 elements of the buffer and returns their sum.
+ *
+ * A stride that rounds down to zero elements (g_skip < 4, including the
+ * default 0) or is negative is treated as one element, so the loop always
+ * makes progress. The sum wraps modulo 2^32 rather than overflowing a signed
+ * int.
+ */
+static int read_c_32_skip(const void *mem_, size_t size)
+{
+	const int32_t *mem = mem_;
+	size_t count = size / sizeof(*mem);
+	int skip = g_skip / (int)sizeof(*mem);
+	size_t step = skip > 0 ? (size_t)skip : 1;
+	unsigned int sum = 0;
+	size_t i;
+
+	for (i = 0; i < count; i += step)
+		sum += (unsigned int)mem[i];
+
+	return (int)sum;
+}
+
+/*
+ * Read test built on aligned_fetch_fbmem_to_scratch_neon(): walks `mem` in
+ * 256-byte steps and fetches each piece into the start of dummy2.
+ *
+ * Every call requests a full 256 bytes, so a `size` that is not a multiple of
+ * 256 makes the last call read past mem + size. Always returns 0.
+ */
+static int read_fbt_neon(const void *mem, size_t size)
+{
+	enum { CHUNK = 256 };
+	const char *src = mem;	// byte pointer: arithmetic on void * is a GNU extension
+	size_t off;
+
+	for (off = 0; off < size; off += CHUNK)
+		aligned_fetch_fbmem_to_scratch_neon(CHUNK, dummy2, src + off);
+
+	return 0;
+}
+
+/*
+ * Read test built on aligned_fetch_fbmem_to_scratch_vfp(): walks `mem` in
+ * 256-byte steps and fetches each piece into the start of dummy2.
+ *
+ * Every call requests a full 256 bytes, so a `size` that is not a multiple of
+ * 256 makes the last call read past mem + size. Always returns 0.
+ */
+static int read_fbt_vfp(const void *mem, size_t size)
+{
+	enum { CHUNK = 256 };
+	const char *src = mem;	// byte pointer: arithmetic on void * is a GNU extension
+	size_t off;
+
+	for (off = 0; off < size; off += CHUNK)
+		aligned_fetch_fbmem_to_scratch_vfp(CHUNK, dummy2, src + off);
+
+	return 0;
+}
+
+/*
+ * Time one read test and print a result line.
+ *
+ * name:     label printed at the start of the line
+ * inv:      optional callback invoked on the buffer before every try (may be
+ *           NULL); its cost is included in the measured cycles
+ * test:     read test to time over TEST_SIZE bytes of `mem`
+ * baseline: cycle count to compare against; when non-zero the result is also
+ *           printed as a percentage of it
+ *
+ * The test is repeated TRIES times and the smallest pcnt_get() difference is
+ * printed (divided by 1000) and returned.
+ */
+static unsigned int run(const char *name,
+	void (*inv)(const void *mem, size_t size),
+	int (*test)(const void *mem, size_t size),
+	const void *mem, unsigned int baseline)
+{
+	// Storing the result keeps the compiler from discarding read loops
+	// whose only product is their return value.
+	static volatile int sink;
+	unsigned int i, cycles, smallest = ~0u;
+
+	for (i = 0; i < TRIES; i++) {
+		cycles = pcnt_get();
+		if (inv)
+			inv(mem, TEST_SIZE);
+		sink = test(mem, TEST_SIZE);
+		cycles = pcnt_get() - cycles;
+
+		if (cycles < smallest)
+			smallest = cycles;
+	}
+
+	printf("%-10s %6uk", name, smallest / 1000);
+	if (baseline != 0)
+		// the product is unsigned long long, so it needs %llu
+		printf(" %5llu%%", smallest * 100ull / baseline);
+	printf("\n");
+
+	return smallest;
+}
+
+/*
+ * Run every read test against `mem` and print the results under `name`.
+ *
+ * inv is passed through to run() as the optional per-try callback.
+ *
+ * The results of the first call are remembered as the baseline; later calls
+ * hand each test's baseline to run() so it can print a percentage.
+ */
+static void run_all(const char *name,
+	void (*inv)(const void *mem, size_t size),
+	const void *mem)
+{
+	static const struct {
+		const char *label;
+		int (*test)(const void *mem, size_t size);
+		int skip;	// value stored to g_skip before the test; 0 = leave as is
+	} tests[] = {
+		{ " 8", read_c_8, 0 },
+		{ " 16", read_c_16, 0 },
+		{ " 32", read_c_32, 0 },
+		{ " 64", read_c_64, 0 },
+		{ " 32_s32", read_c_32_skip, 32 },
+		{ " 32_s64", read_c_32_skip, 64 },
+		{ " ldrd", read_ldrd, 0 },
+		{ " ldrd pld", read_ldrd_pld, 0 },
+		{ " fbt neon", read_fbt_neon, 0 },
+		{ " fbt vfp", read_fbt_vfp, 0 },
+	};
+	enum { NUM_TESTS = sizeof(tests) / sizeof(tests[0]) };
+	// Both arrays are sized from the table, so every copied entry is one
+	// that was actually measured.
+	static unsigned int b[NUM_TESTS];
+	unsigned int r[NUM_TESTS];
+	unsigned int t;
+
+	printf("%s\n", name);
+	for (t = 0; t < NUM_TESTS; t++) {
+		if (tests[t].skip != 0)
+			g_skip = tests[t].skip;
+		r[t] = run(tests[t].label, inv, tests[t].test, mem, b[t]);
+	}
+
+	// first call: remember these results as the baseline
+	if (b[0] == 0)
+		memcpy(b, r, sizeof(b));
+}
+
+/*
+ * Adapter with the callback signature run() expects: forwards the buffer and
+ * its size to dsp_cache_inv_virt().
+ */
+static void shm_inv(const void *mem, size_t size)
+{
+	// dsp_cache_inv_virt() is called with a non-const pointer
+	void *target = (void *)mem;
+
+	dsp_cache_inv_virt(target, size);
+}
+
+/*
+ * Allocate TEST_SIZE bytes with dsp_shm_alloc() using cache type `ct`, run
+ * the whole test suite on that memory under the heading `name`, then release
+ * the region with dsp_shm_free().
+ *
+ * When use_inv is non-zero, shm_inv() is passed to run_all() as the per-try
+ * callback; otherwise no callback is used. If the returned region is smaller
+ * than TEST_SIZE or has a zero address, an error is printed to stderr and
+ * nothing is run.
+ */
+static void run_shm(const char *name, dsp_cache_t ct, int use_inv)
+{
+	void (*inv_cb)(const void *mem, size_t size) = NULL;
+	dsp_mem_region_t shm = dsp_shm_alloc(ct, TEST_SIZE);
+
+	if (!(shm.size >= TEST_SIZE && shm.virt_addr != 0)) {
+		fprintf(stderr, "dsp_shm_alloc failed\n");
+		return;
+	}
+
+	if (use_inv)
+		inv_cb = shm_inv;
+
+	run_all(name, inv_cb, (void *)shm.virt_addr);
+
+	dsp_shm_free(shm);
+}
+
+/*
+ * Benchmark entry point: touches both static buffers, sets up the cycle
+ * counter, runs the suite on the static buffer (this first run becomes the
+ * baseline), then repeats it on shared-memory regions of several cache types.
+ *
+ * Returns 0 on success, 1 if dsp_open() fails.
+ */
+int main(void)
+{
+	int ret;
+
+	// prefault
+	memset(dummy, 1, sizeof(dummy));
+	memset(dummy2, 1, sizeof(dummy2));
+	// %p requires a void * argument
+	printf("pid: %d dummy: %p\n", (int)getpid(), (void *)dummy);
+
+	pcnt_init();
+
+	run_all(".bss", NULL, dummy);
+
+	ret = dsp_open();
+	if (ret != 0) {
+		fprintf(stderr, "dsp_open %d\n", ret);
+		return 1;
+	}
+
+	run_shm("shm wb", DSP_CACHE_RW, 0);
+	//run_shm("shm wt", DSP_CACHE_R, 0);
+	run_shm("shm nc", DSP_CACHE_W, 0);
+	//run_shm("shm wb inv", DSP_CACHE_RW, 1);
+	run_shm("shm wt inv", DSP_CACHE_R, 1);
+
+	dsp_close();
+
+	return 0;
+}