7 #include <inc_libc64_mini.h>
// External block-fetch routines: prototypes only, no definition is visible
// in this file. Further down they are called with a 256-byte count, dummy2
// as `scratch` and a pointer into the tested region as `fbmem`.
// NOTE(review): presumably hand-written NEON / VFP assembly copy loops
// that live in a separate object file -- confirm against the build.
9 void aligned_fetch_fbmem_to_scratch_neon(int numbytes, void *scratch, const void *fbmem);
10 void aligned_fetch_fbmem_to_scratch_vfp (int numbytes, void *scratch, const void *fbmem);
// Number of bytes read by every test (1 MiB).
12 #define TEST_SIZE (1024 * 1024)
// Two statically allocated, 64-byte aligned buffers of TEST_SIZE bytes.
// `dummy` is the region benchmarked under the ".bss" label; `dummy2` is
// the scratch destination handed to the fbt fetch routines.
15 static char dummy [TEST_SIZE] __attribute__((aligned(64)));
16 static char dummy2[TEST_SIZE] __attribute__((aligned(64)));
// Set up the CPU cycle counter through coprocessor 15.
// The encodings used (c9, c12, 0 and c9, c12, 1) correspond to PMCR and
// PMCNTENSET in the ARMv7-A performance monitor register map.
// NOTE(review): user-space access to these registers normally has to be
// enabled by the kernel first -- confirm how that is arranged on the target.
18 static inline void pcnt_init(void)
// Read-modify-write of the control register: fetch the current value...
21 asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v));
// ...set bits 0 and 2 (value 5)...
22 v |= 5; // master enable, ccnt reset
// ...clear bit 3 (value 8)...
23 v &= ~8; // ccnt divider 0
// ...and write it back.
24 asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v));
25 // enable cycle counter
// NOTE(review): `1<<31` shifts a signed int into its sign bit, which the C
// standard leaves undefined; `1u << 31` expresses the same bit pattern
// without relying on compiler behaviour.
26 asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1<<31));
// Return the current cycle counter value as an unsigned int.
// The value comes from CP15 register c9, c13, 0 (PMCCNTR in the ARMv7-A
// register map). The operand list of the asm statement and the return
// statement are outside the visible part of this block.
29 static inline unsigned int pcnt_get(void)
32 __asm__ volatile("mrc p15, 0, %0, c9, c13, 0"
// make_rd_test(name, type) expands to
//     static int name(const void *mem_, size_t size)
// which views `mem_` as an array of `type` and converts `size` from a byte
// count into an element count (size /= sizeof(*mem)). The rest of the
// generated body (presumably the read loop) is not visible in this view.
// The macro is instantiated below for 8-, 16-, 32- and 64-bit signed
// integer element types.
37 #define make_rd_test(name, type) \
38 static int name(const void *mem_, size_t size) \
40 const type *mem = mem_; \
43 size /= sizeof(*mem); \
50 make_rd_test(read_c_8, int8_t)
51 make_rd_test(read_c_16, int16_t)
52 make_rd_test(read_c_32, int32_t)
53 make_rd_test(read_c_64, int64_t)
// Read test implemented in inline assembly with LDRD (doubleword loads).
// mem:  start of the region to read.
// size: length of the region; how it is scaled to a loop count is not
//       visible here.
55 static int read_ldrd(const void *mem, size_t size)
// Loop label 0: load 8 bytes into r2/r3 from the address in operand 0,
// then post-increment that address by 8.
59 "0: ldrd r2, r3, [%0], #8\n"
// Both operands are declared as early-clobber, write-only outputs.
// NOTE(review): the asm also reads `mem` and `size`; confirm that the input
// list (not visible) ties them in with matching constraints ("0"/"1"),
// otherwise "+r" read-write operands are required. Also confirm r2/r3 are
// listed as clobbers.
62 : "=&r"(mem), "=&r"(size)
// Variant of the LDRD read test.
// NOTE(review): the name suggests a PLD (preload) instruction is issued in
// the loop, but that instruction is not in the visible lines -- confirm.
68 static int read_ldrd_pld(const void *mem, size_t size)
// Loop label 0: load 8 bytes into r2/r3 from the address in operand 0,
// then post-increment that address by 8.
72 "0: ldrd r2, r3, [%0], #8\n"
// Early-clobber, write-only outputs for the pointer and the counter.
// NOTE(review): both are also read by the asm; confirm the input list (not
// visible) supplies them via matching constraints, or switch to "+r".
76 : "=&r"(mem), "=&r"(size)
// Strided read test over `int` elements; the stride comes from the
// file-level variable g_skip, which is defined outside this view.
83 static int read_c_32_skip(const void *mem_, size_t size)
85 const int *mem = mem_;
// g_skip is divided by 4 to get a stride in elements.
// NOTE(review): presumably g_skip is in bytes and sizeof(int) == 4 is
// assumed -- confirm. If g_skip < 4 the stride becomes 0 and the loop
// below would never advance.
86 int skip = g_skip / 4;
// Step through the region `skip` elements at a time.
// NOTE(review): whether `size` has been converted from bytes to elements
// before this loop is not visible here.
91 for (i = 0; i < size; i += skip)
// Read test that pulls the region through the external NEON fetch routine.
97 static int read_fbt_neon(const void *mem, size_t size)
// Walk the region in 256-byte steps.
101 for (i = 0; i < size; i += 256)
// Every chunk is fetched to the start of dummy2, so the scratch area is
// overwritten on each iteration. `mem + i` is arithmetic on a void
// pointer, which is a GNU C extension (byte-granular).
102 aligned_fetch_fbmem_to_scratch_neon(256, dummy2, mem + i);
// Read test that pulls the region through the external VFP fetch routine.
107 static int read_fbt_vfp(const void *mem, size_t size)
// Walk the region in 256-byte steps.
111 for (i = 0; i < size; i += 256)
// Every chunk is fetched to the start of dummy2, so the scratch area is
// overwritten on each iteration. `mem + i` is arithmetic on a void
// pointer, which is a GNU C extension (byte-granular).
112 aligned_fetch_fbmem_to_scratch_vfp(256, dummy2, mem + i);
// Time one read test and print its result.
// name:     label printed in the first column.
// inv:      optional callback; callers pass either a cache-invalidate
//           helper or NULL. Its call site is not visible here --
//           presumably it runs before each timed pass when non-NULL.
// test:     the read routine to time; it is called with (mem, TEST_SIZE).
// mem:      region to read.
// baseline: cycle count the result is compared against as a percentage.
// Returns an unsigned int (presumably the smallest cycle count measured;
// the return statement is not visible).
117 static unsigned int run(const char *name,
118 void (*inv)(const void *mem, size_t size),
119 int (*test)(const void *mem, size_t size),
120 const void *mem, unsigned int baseline)
// `smallest` starts at the maximum unsigned value so that the first
// measurement always replaces it.
122 unsigned int i, cycles, smallest = ~0;
// Repeat the measurement TRIES times (TRIES is defined outside this view).
124 for (i = 0; i < TRIES; i++) {
128 test(mem, TEST_SIZE);
// Elapsed cycles: current counter minus the value held in `cycles`
// (presumably the counter sampled just before the test call).
129 cycles = pcnt_get() - cycles;
// Keep the best (lowest) result across tries.
131 if (cycles < smallest)
// Print the label and the best result in thousands of cycles.
135 printf("%-10s %6uk", name, smallest / 1000);
// Print the result as a percentage of `baseline`, computed in 64 bits.
// NOTE(review): the expression has type unsigned long long, so "%llu" is
// the matching conversion rather than "%lld". Also confirm that a guard
// against baseline == 0 exists on the line above (not visible).
137 printf(" %5lld%%", smallest * 100ull / baseline);
// Run every read test on one memory region and print a table headed by
// `name`.
// name: heading printed before the results.
// inv:  passed through unchanged to run() for every test.
// The remaining parameter(s), including the `mem` pointer used below, are
// declared on lines outside this view.
143 static void run_all(const char *name,
144 void (*inv)(const void *mem, size_t size),
// Baselines persist across calls because the array is static; it starts
// out zero-filled.
147 static unsigned int b[16];
151 printf("%s\n", name);
// Each call stores its result in r[t] and uses b[t] as the baseline for
// the percentage column; `t` advances once per test.
152 r[t] = run(" 8", inv, read_c_8, mem, b[t]); t++;
153 r[t] = run(" 16", inv, read_c_16, mem, b[t]); t++;
154 r[t] = run(" 32", inv, read_c_32, mem, b[t]); t++;
155 r[t] = run(" 64", inv, read_c_64, mem, b[t]); t++;
// The two strided entries call the same routine.
// NOTE(review): presumably g_skip is set to 32 and 64 on the lines
// preceding each call (not visible) -- confirm.
157 r[t] = run(" 32_s32", inv, read_c_32_skip, mem, b[t]); t++;
159 r[t] = run(" 32_s64", inv, read_c_32_skip, mem, b[t]); t++;
160 r[t] = run(" ldrd", inv, read_ldrd, mem, b[t]); t++;
161 r[t] = run(" ldrd pld", inv, read_ldrd_pld, mem, b[t]); t++;
162 r[t] = run(" fbt neon", inv, read_fbt_neon, mem, b[t]); t++;
163 r[t] = run(" fbt vfp", inv, read_fbt_vfp, mem, b[t]); t++;
// Copy this call's results into b.
// NOTE(review): sizeof(b) bytes are copied, so `r` (declared outside this
// view) must be at least as large as `b`. Whether the copy is conditional
// depends on the preceding line, which is not visible; if it is not, later
// calls report percentages relative to the previous run_all() call.
166 memcpy(b, r, sizeof(b));
// Adapter with the signature of the `inv` callback: forwards the region
// to dsp_cache_inv_virt().
169 static void shm_inv(const void *mem, size_t size)
// The const qualifier is cast away because dsp_cache_inv_virt() is called
// with a non-const pointer.
171 dsp_cache_inv_virt((void *)mem, size);
// Allocate a TEST_SIZE shared-memory region with the given cache type,
// run the full test table on it, then free it.
// name:    heading passed to run_all().
// ct:      cache type handed to dsp_shm_alloc().
// use_inv: non-zero to pass shm_inv as the invalidate callback, zero to
//          pass NULL.
174 static void run_shm(const char *name, dsp_cache_t ct, int use_inv)
176 dsp_mem_region_t region;
179 region = dsp_shm_alloc(ct, TEST_SIZE);
// Treat a short region or a zero virtual address as allocation failure.
// What happens after the message (presumably an early return) is not
// visible here.
180 if (region.size < TEST_SIZE || region.virt_addr == 0) {
181 fprintf(stderr, "dsp_shm_alloc failed\n");
184 mem = (void *)region.virt_addr;
185 // printf("mapped %d %p\n", ct, mem);
187 run_all(name, use_inv ? shm_inv : NULL, mem);
189 dsp_shm_free(region);
// NOTE(review): these statements presumably form the body of the program
// entry point; the enclosing function header is outside this view.
// Fill both static buffers with the byte value 1 before any measurement.
197 memset(dummy, 1, sizeof(dummy));
198 memset(dummy2, 1, sizeof(dummy2));
199 printf("pid: %d dummy: %p\n", (int)getpid(), dummy);
// First table: the static buffer, with no invalidate callback.
203 run_all(".bss", NULL, dummy);
// Error report for dsp_open; the call and the condition that guards this
// message are not visible here.
207 fprintf(stderr, "dsp_open %d\n", ret);
// Shared-memory tables, one per cache type.
// NOTE(review): the labels suggest "wb" = write-back, "nc" = non-cached and
// "wt" = write-through; confirm that mapping against the DSP_CACHE_*
// definitions. Only the last enabled variant uses the invalidate callback.
211 run_shm("shm wb", DSP_CACHE_RW, 0);
212 //run_shm("shm wt", DSP_CACHE_R, 0);
213 run_shm("shm nc", DSP_CACHE_W, 0);
214 //run_shm("shm wb inv", DSP_CACHE_RW, 1);
215 run_shm("shm wt inv", DSP_CACHE_R, 1);