/*----------------------------------------------------------------------
 * void *memcpy_neon(void *dst, const void *src, size_t n)
 *
 * Syntax: GNU as, ARM (32-bit) with NEON.
 * ABI:    AAPCS register usage is assumed (r0-r2 arguments, result in
 *         r0, return via lr).
 *
 * In:     r0 = dst, r1 = src, r2 = n (byte count, treated as unsigned)
 * Out:    r0 = dst (never modified)
 * Clobb:  r1, r2, r3, r12, d0-d7, d16-d19, flags
 *         r4-r11 are used as copy registers but saved and restored.
 *         d8-d15 are deliberately left untouched.
 * Stack:  32 bytes (r4-r11).
 *
 * Alignment: when n >= 128 both dst and src must be 16-byte aligned,
 * because the NEON accesses carry the :128 alignment qualifier.
 * Buffers shorter than 128 bytes are copied bytewise and have no
 * alignment requirement.
 *
 * Strategy: full 128-byte blocks are moved with 3 x 32 bytes through
 * NEON plus 32 bytes through r4-r11; the remaining 0..127 bytes are
 * moved one byte at a time, so exactly n bytes are written.
 *--------------------------------------------------------------------*/
        .fpu    neon
        .text
        .global memcpy_neon
        .type   memcpy_neon, %function  @ typed symbol allows ARM/Thumb interworking at link time
        .func   memcpy_neon
memcpy_neon:
        push    {r4-r11}                @ r4-r11 carry 32 bytes of every block
        mov     r3, r0                  @ r3 = write cursor, r0 stays intact as the return value

        subs    r2, r2, #128            @ r2 = n - 128, borrow means n < 128
        blo     2f                      @ unsigned test: no full block to copy

        @ Invariant at label 1: r2 = (bytes still to copy) - 128, and at
        @ least one full 128-byte block remains.
1:
        pld     [r1, #192]              @ prefetch ahead of the read cursor
        pld     [r1, #256]
        vld1.64 {d0-d3}, [r1,:128]!     @ bytes   0..31
        vld1.64 {d4-d7}, [r1,:128]!     @ bytes  32..63
        vld1.64 {d16-d19}, [r1,:128]!   @ bytes  64..95
        ldm     r1!, {r4-r11}           @ bytes  96..127
        subs    r2, r2, #128            @ flags are consumed by bhs below; the stores leave them alone
        vst1.64 {d0-d3}, [r3,:128]!
        vst1.64 {d4-d7}, [r3,:128]!
        vst1.64 {d16-d19}, [r3,:128]!
        stm     r3!, {r4-r11}
        bhs     1b                      @ no borrow: another full block remains

2:
        adds    r2, r2, #128            @ undo the bias: r2 = leftover bytes, 0..127
        beq     4f                      @ nothing left (also covers n == 0)

3:
        ldrb    r12, [r1], #1           @ bytewise tail copy
        strb    r12, [r3], #1
        subs    r2, r2, #1
        bne     3b

4:
        pop     {r4-r11}
        bx      lr
        .size   memcpy_neon, . - memcpy_neon
        .endfunc