--- /dev/null
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+#define SIZE (10 * 1024 * 1024)
+
+/*
+ * Configure and start the CPU cycle counter through coprocessor 15.
+ *
+ * Reads the control register at c9/c12/0, sets bits 0 and 2 (master
+ * enable, cycle counter reset), clears bit 3 (cycle counter divider) and
+ * writes it back; then writes bit 31 to c9/c12/1 to enable the cycle
+ * counter itself.
+ *
+ * All values are kept unsigned: shifting a signed 1 into bit 31 is
+ * undefined behaviour in C.
+ *
+ * NOTE(review): these CP15 accesses presumably need privileged mode or
+ * user-mode access enabled by the kernel - confirm for the target.
+ */
+static inline void pcnt_init(void)
+{
+    unsigned int v;
+
+    asm volatile("mrc p15, 0, %0, c9, c12, 0" : "=r"(v));
+    v |= 5u;  // master enable, ccnt reset
+    v &= ~8u; // ccnt divider 0
+    asm volatile("mcr p15, 0, %0, c9, c12, 0" :: "r"(v));
+    // enable cycle counter
+    asm volatile("mcr p15, 0, %0, c9, c12, 1" :: "r"(1u << 31));
+}
+
+/*
+ * Return the current value of the CP15 c9/c13/0 register, i.e. the
+ * counter that pcnt_init() resets and enables.
+ */
+static inline unsigned int pcnt_get(void)
+{
+    unsigned int cycles;
+
+    __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(cycles));
+
+    return cycles;
+}
+
+/*
+ * Memory-read benchmark kernels, implemented in the top-level asm block
+ * below.
+ *
+ * Register contract used by every routine:
+ *   r0 = mem   - start of the buffer to walk
+ *   r1 = bytes - number of bytes to cover
+ * r3 and r12 receive the loaded data, which is discarded.
+ *
+ * Naming:
+ *   ldrN     - one load every N bytes (ldrb for N == 1)
+ *   ...pld   - additionally issue a pld ahead of the pointer each iteration
+ *   ...pld2  - issue the pld only when the pointer is 64-byte aligned
+ *   ldr1ldr  - byte loads plus a second real load 128 bytes ahead
+ *   pld64    - no loads at all, only one pld every 64 bytes
+ *
+ * The *pld2 routines test the remaining byte count only when r0 is
+ * 64-byte aligned, so they require mem to be 64-byte aligned and bytes
+ * to be a multiple of 64 in order to terminate.
+ */
+extern void do_ldr64(void *mem, int bytes);
+extern void do_ldr64pld(void *mem, int bytes);
+extern void do_ldr32(void *mem, int bytes);
+extern void do_ldr32pld(void *mem, int bytes);
+extern void do_ldr4(void *mem, int bytes);
+extern void do_ldr4pld(void *mem, int bytes);
+extern void do_ldr4pld2(void *mem, int bytes);
+extern void do_ldr1(void *mem, int bytes);
+extern void do_ldr1pld(void *mem, int bytes);
+extern void do_ldr1ldr(void *mem, int bytes);
+extern void do_ldr1pld2(void *mem, int bytes);
+extern void do_pld64(void *mem, int bytes);
+asm(
+/* one word load per 64 bytes */
+".align 3\n"
+"do_ldr64:\n"
+"    ldr r3, [r0], #64\n"
+"    subs r1, #64\n"
+"    bgt do_ldr64\n"
+"    nop\n"
+"    bx lr\n"
+
+/* one word load per 64 bytes, pld 3*64 bytes past the advanced pointer */
+".align 3\n"
+"do_ldr64pld:\n"
+"    ldr r3, [r0], #64\n"
+"    subs r1, #64\n"
+"    pld [r0, #64*3]\n"
+"    bgt do_ldr64pld\n"
+"    bx lr\n"
+
+/* one word load per 32 bytes */
+".align 3\n"
+"do_ldr32:\n"
+"    ldr r3, [r0], #32\n"
+"    subs r1, #32\n"
+"    bgt do_ldr32\n"
+"    nop\n"
+"    bx lr\n"
+
+/* one word load per 32 bytes, pld 3*64 bytes past the advanced pointer */
+".align 3\n"
+"do_ldr32pld:\n"
+"    ldr r3, [r0], #32\n"
+"    subs r1, #32\n"
+"    pld [r0, #64*3]\n"
+"    bgt do_ldr32pld\n"
+"    bx lr\n"
+
+/* load every word */
+".align 3\n"
+"do_ldr4:\n"
+"    ldr r3, [r0], #4\n"
+"    subs r1, #4\n"
+"    bgt do_ldr4\n"
+"    nop\n"
+"    bx lr\n"
+
+/* load every word, pld 2*64 bytes ahead on every iteration */
+".align 3\n"
+"do_ldr4pld:\n"
+"    ldr r3, [r0], #4\n"
+"    subs r1, #4\n"
+"    pld [r0, #64*2]\n"
+"    bgt do_ldr4pld\n"
+"    bx lr\n"
+
+/* load every word, pld only once per 64-byte-aligned pointer */
+".align 3\n"
+"do_ldr4pld2:\n"
+/* post-increment is required: the alignment test below relies on r0
+ * advancing, otherwise the same word is reloaded on every iteration */
+"    ldr r3, [r0], #4\n"
+"    sub r1, #4\n"
+"    tst r0, #63\n"
+"    bne do_ldr4pld2\n"
+"    pld [r0, #64*3]\n"
+"    tst r1, r1\n"
+"    bne do_ldr4pld2\n"
+"    bx lr\n"
+
+/* load every byte */
+".align 3\n"
+"do_ldr1:\n"
+"    ldrb r3, [r0], #1\n"
+"    subs r1, #1\n"
+"    bgt do_ldr1\n"
+"    nop\n"
+"    bx lr\n"
+
+/* load every byte, pld 2*64 bytes ahead on every iteration */
+".align 3\n"
+"do_ldr1pld:\n"
+"    ldrb r3, [r0], #1\n"
+"    subs r1, #1\n"
+"    pld [r0, #64*2]\n"
+"    bgt do_ldr1pld\n"
+"    bx lr\n"
+
+/* load every byte plus a real byte load 2*64 bytes ahead; the byte count
+ * is reduced by 2*64+1 up front so the look-ahead load stays inside the
+ * buffer */
+".align 3\n"
+"do_ldr1ldr:\n"
+"    sub r1, #64*2+1\n"
+"    nop\n" // important!
+"0:\n"
+"    ldrb r3, [r0], #1\n"
+"    subs r1, #1\n"
+"    ldrb r12,[r0, #64*2]\n"
+"    bgt 0b\n"
+"    bx lr\n"
+
+/* load every byte, pld only once per 64-byte-aligned pointer */
+".align 3\n"
+"do_ldr1pld2:\n"
+"    ldrb r3, [r0], #1\n"
+"    sub r1, #1\n"
+"    tst r0, #63\n"
+"    bne do_ldr1pld2\n"
+"    tst r1, r1\n"
+"    pld [r0, #64*2]\n"
+"    bne do_ldr1pld2\n"
+"    bx lr\n"
+
+/* no loads: one pld per 64 bytes */
+".align 3\n"
+"do_pld64:\n"
+"    pld [r0]\n"
+"    add r0, #64\n"
+"    subs r1, #64\n"
+"    bgt do_pld64\n"
+"    nop\n"
+"    bx lr\n"
+);
+
+/* Build one table entry: the printable name "n" paired with routine do_n. */
+#define T(n) { .name = #n, .f = do_##n }
+
+/* Benchmark routines, in the order main() runs and reports them. */
+static const struct {
+    const char *name;                 /* label printed next to the result */
+    void (*f)(void *mem, int bytes);  /* routine to time */
+} tests[] = {
+    T(ldr64),
+    T(ldr64pld),
+    /* 32-byte stride variants are currently disabled: */
+//  T(ldr32),
+//  T(ldr32pld),
+    T(ldr4),
+    T(ldr4pld),
+    T(ldr4pld2),
+    T(ldr1),
+    T(ldr1pld),
+    T(ldr1ldr),
+    T(ldr1pld2),
+    T(pld64),
+};
+
+/*
+ * Benchmark driver.
+ *
+ * Starts the cycle counter, maps a SIZE-byte anonymous buffer (huge pages
+ * requested, pre-populated), then runs every entry of tests[] 8 times
+ * over the whole buffer and prints the smallest cycle count seen for
+ * each, in millions of cycles.
+ *
+ * Returns 0 on success, 1 if the buffer cannot be mapped.
+ */
+int main(void)
+{
+    unsigned int cnt, min;
+    void *mem;
+    size_t t; /* unsigned, to match the sizeof-based bound below */
+    int i;
+
+    pcnt_init();
+
+    mem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0);
+    if (mem == MAP_FAILED) {
+        perror("mmap");
+        return 1;
+    }
+    /* optional: dirty the buffer before timing (left disabled) */
+    //memset(mem, 1, SIZE);
+
+    for (t = 0; t < sizeof(tests) / sizeof(tests[0]); t++) {
+        min = ~0u;
+        for (i = 0; i < 8; i++) {
+            cnt = pcnt_get();
+            tests[t].f(mem, SIZE);
+            /* unsigned subtraction stays correct across one wrap */
+            cnt = pcnt_get() - cnt;
+            if (min > cnt)
+                min = cnt;
+        }
+        printf("%-8s %5.1f\n", tests[t].name, min / 1000000.0f);
+    }
+
+    munmap(mem, SIZE);
+
+    return 0;
+}