--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <malloc.h>
+#include <sys/time.h>
+
+extern void *memcpy_neon(void *dst, const void *src, size_t size); /* no definition in the visible source; benchmarked as "ASM SIMD" in main() */
+
+#define BUFSIZE (8*1024*1024) /* size in bytes (8 MiB) of each buffer allocated in main() */
+#define ITER 128 /* number of times do_test() calls the copy routine per measurement */
+
+/*
+ * Elapsed time from *tv1 to *tv2, expressed in microseconds.
+ *
+ * The whole-second difference is scaled by 1000000 and the microsecond
+ * difference is added to it; the sum is converted to unsigned on return.
+ */
+static unsigned
+tv_diff(struct timeval *tv1, struct timeval *tv2)
+{
+	time_t secs = tv2->tv_sec - tv1->tv_sec;
+	suseconds_t usecs = tv2->tv_usec - tv1->tv_usec;
+
+	return secs * 1000000 + usecs;
+}
+
+/*
+ * Time ITER back-to-back calls of cpy(p1, p2, size) and print the resulting
+ * throughput, in bytes per second, on one line labelled with name.
+ *
+ * name: label printed in the first column of the result line
+ * p1:   passed to cpy as its first (destination) argument
+ * p2:   passed to cpy as its second (source) argument
+ * size: byte count passed to cpy on every call
+ * cpy:  copy routine under test, with a memcpy-compatible signature
+ */
+static void do_test(const char *name, void *p1, void *p2, size_t size,
+		    void *(*cpy)(void *, const void *, size_t))
+{
+	struct timeval t1, t2;
+	unsigned elapsed_us;
+	int i;
+
+	gettimeofday(&t1, NULL);
+	for (i = 0; i < ITER; i++)
+		cpy(p1, p2, size);
+	gettimeofday(&t2, NULL);
+
+	/*
+	 * A run that completes within the clock's resolution yields 0 us.
+	 * Clamp to 1 us so the division below is always defined.
+	 */
+	elapsed_us = tv_diff(&t1, &t2);
+	if (elapsed_us == 0)
+		elapsed_us = 1;
+
+	/* %llu expects unsigned long long; uint64_t may be unsigned long. */
+	printf("%-8s %llu B/s\n", name,
+	       (unsigned long long)size * ITER * 1000000 / elapsed_us);
+}
+
+/*
+ * Copy size bytes from src to dst one 32-bit word at a time.
+ *
+ * Only whole words are copied: size is divided by 4 (rounding down), so up
+ * to 3 trailing bytes are left untouched. Both pointers are dereferenced as
+ * uint32_t, so they should be suitably aligned for that type.
+ *
+ * Returns dst.
+ */
+static void *int32_cpy(void *dst, const void *src, size_t size)
+{
+	const uint32_t *s = src;
+	uint32_t *d = dst;
+	size_t words = size / 4;
+	size_t i;	/* same type as the bound: no signed/unsigned compare */
+
+	for (i = 0; i < words; i++)
+		d[i] = s[i];
+
+	return dst;
+}
+
+/*
+ * Copy size bytes from src to dst in 16-byte chunks, using a compiler
+ * vector type (four ints per element).
+ *
+ * size is divided by 16 (rounding down), so a trailing remainder of up to
+ * 15 bytes is not copied.
+ *
+ * Returns dst.
+ */
+static void *vec_cpy(void *dst, const void *src, size_t size)
+{
+	typedef int v4si __attribute__ ((vector_size(16)));
+	const v4si *in = src;
+	v4si *out = dst;
+	size_t chunks = size / 16;
+	size_t n;
+
+	for (n = 0; n < chunks; n++)
+		out[n] = in[n];
+
+	return dst;
+}
+
+/*
+ * Benchmark several copy routines over the same pair of 64-byte-aligned,
+ * BUFSIZE-byte buffers, printing one throughput line per routine.
+ *
+ * Returns 0 on success, EXIT_FAILURE if either buffer cannot be allocated.
+ */
+int main(int argc, char **argv)
+{
+	void *buf1, *buf2;
+
+	/* Command-line arguments are not used. */
+	(void)argc;
+	(void)argv;
+
+	buf1 = memalign(64, BUFSIZE);
+	buf2 = memalign(64, BUFSIZE);
+	if (!buf1 || !buf2) {
+		fprintf(stderr, "failed to allocate benchmark buffers\n");
+		free(buf1);
+		free(buf2);
+		return EXIT_FAILURE;
+	}
+
+	/* Give the source buffer defined contents before it is read. */
+	memset(buf2, 0, BUFSIZE);
+	/*
+	 * Write the destination once up front too, so the first routine
+	 * measured is not also the first code to touch that memory.
+	 */
+	memset(buf1, 0, BUFSIZE);
+
+	do_test("memcpy", buf1, buf2, BUFSIZE, memcpy);
+	do_test("INT32", buf1, buf2, BUFSIZE, int32_cpy);
+	do_test("C SIMD", buf1, buf2, BUFSIZE, vec_cpy);
+	do_test("ASM SIMD", buf1, buf2, BUFSIZE, memcpy_neon);
+
+	free(buf1);
+	free(buf2);
+
+	return 0;
+}