xf86-video-msm: fix build errors
author David Lanzendörfer <david.lanzendoerfer@o2s.ch>
Wed, 17 Mar 2010 20:25:38 +0000 (21:25 +0100)
committer Lukas Gorris <lukas.gorris@gmail.com>
Wed, 17 Mar 2010 20:25:38 +0000 (21:25 +0100)
recipes/xorg-driver/xf86-video-msm/no_neon.patch [new file with mode: 0644]
recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch [new file with mode: 0644]
recipes/xorg-driver/xf86-video-msm/renaming_variables.patch [new file with mode: 0644]
recipes/xorg-driver/xf86-video-msm_git.bb

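The diff for recipes/xorg-driver/xf86-video-msm_git.bb itself is not reproduced below; presumably the recipe picks up the three new patch files through its SRC_URI in the usual OpenEmbedded way, roughly along these lines (a hedged sketch only — the exact entries and any revision bump in the real recipe may differ):

    # assumed recipe change: register the new patches so do_patch applies them
    SRC_URI += "file://no_neon.patch \
                file://no_neon_flags.patch \
                file://renaming_variables.patch \
               "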
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon.patch b/recipes/xorg-driver/xf86-video-msm/no_neon.patch
new file mode 100644 (file)
index 0000000..c0aa92e
--- /dev/null
@@ -0,0 +1,2901 @@
+commit d8910bf773fbecf7cdea359d4b530a3672e27180
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date:   Wed Feb 10 16:18:39 2010 +0100
+
+    Removed NEON because it's not available in our kernel
+    and so it's causing trouble (Illegal instruction)
+
+diff --git git/src/msm-swblits.h git/src/msm-swblits.h
+index f89f00e..a40b24b 100755
+--- git/src/msm-swblits.h
++++ git/src/msm-swblits.h
+@@ -38,16 +38,6 @@
+ #include <stdint.h>
+ #include <stdlib.h>
+-/* Neon intrinsics are part of the ARM or GCC compiler used.                                                              */
+-/* Tested with: /pkg/asw/compilers/gnu/codesourcery/arm-2008q3-72/lib/gcc/arm-none-linux-gnueabi/4.3.2/include/arm_neon.h */
+-#include <arm_neon.h>
+-
+-/* These are NEON-optimized functions linked to by various tests.  */
+-extern void * neon_memcpy (void * dest, const void * source, unsigned int numBytes);
+-extern void * neon_memmove (void * dest, const void * source, unsigned int numBytes);
+-extern void memset16(uint16_t *dst, uint16_t value, int count);
+-extern void memset32(uint32_t *dst, uint32_t value, int count);
+-
+ /* Make definitions to clarify memory-related sizes to enable avoidance of magic numbers. */
+ #define BITS_PER_BYTE (8)
+ #define BYTES_PER_16BPP_PIXEL (2)
+diff --git git/src/msm-swfill.c git/src/msm-swfill.c
+index 108fd94..3dd1ef2 100755
+--- git/src/msm-swfill.c
++++ git/src/msm-swfill.c
+@@ -212,7 +212,7 @@ memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int cou
+    }
+ }
+-
++/*
+ static inline void
+ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+ {
+@@ -333,7 +333,7 @@ memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
+    // Quickly fill remaining pixels (up to 7).
+    memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
+ }
+-
++*/
+ static inline void
+ memset16_Test(uint16_t *dst, uint16_t src, int count)
+@@ -368,7 +368,8 @@ memset16_Test(uint16_t *dst, uint16_t src, int count)
+       // Copy remaining pixels using Neon and non-Neon instructions.
+       // NOTE: This assumes that dst is aligned optimally for Neon instructions.
+-      memset16_AssumesNeonAlignment((void *) dst, src, count);
++      //memset16_AssumesNeonAlignment((void *) dst, src, count);
++      memset((void *) dst, src, count);
+    }
+ }
+@@ -435,12 +436,14 @@ swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dp
+    if (w < 32) {
+       // For narrow rectangles, block signals only once for the entire rectangles.
+       BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+-      DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++      //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
++      DO_MULTIPLE_FILLS_WITH_MEMSET(memset,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
+       UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+    }
+    else {
+       // For wider rectangles, block and unblock signals for every row.
+-      DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++      //DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++      DO_MULTIPLE_FILLS_WITH_MEMSET(memset,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+    }
+ }
+diff --git git/src/msm-swrender.c git/src/msm-swrender.c
+index a7a9abc..835dc03 100755
+--- git/src/msm-swrender.c
++++ git/src/msm-swrender.c
+@@ -214,160 +214,6 @@ swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src
+                   }
+                }
+                break;
+-      case  7: if (xdir >= 0) {
+-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+-                  return TRUE;
+-               } else {
+-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
+-                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
+-                  return TRUE;
+-               }
+-               break;
+-      case  8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
+-                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3  = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4  = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+-                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
+-                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
+-                  *(uint32_t *) (dst+2*BYTES_PER_UINT32_T)  = src3;
+-                  *(uint32_t *) (dst+3*BYTES_PER_UINT32_T)  = src4;
+-                  return TRUE;
+-               }
+-               else {
+-                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3  = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4  = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5  = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6  = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7  = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8  = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
+-                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
+-                  *(uint16_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
+-                  *(uint16_t *) (dst+2*BYTES_PER_UINT16_T)  = src3;
+-                  *(uint16_t *) (dst+3*BYTES_PER_UINT16_T)  = src4;
+-                  *(uint16_t *) (dst+4*BYTES_PER_UINT16_T)  = src5;
+-                  *(uint16_t *) (dst+5*BYTES_PER_UINT16_T)  = src6;
+-                  *(uint16_t *) (dst+6*BYTES_PER_UINT16_T)  = src7;
+-                  *(uint16_t *) (dst+7*BYTES_PER_UINT16_T)  = src8;
+-                  return TRUE;
+-               }
+-               break;
+-      case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+-                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3  = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4  = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
+-                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
+-                  *(uint64_t *) (dst+2*BYTES_PER_UINT64_T)  = src3;
+-                  *(uint64_t *) (dst+3*BYTES_PER_UINT64_T)  = src4;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3  = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4  = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5  = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6  = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7  = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8  = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
+-                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
+-                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
+-                  *(uint32_t *) (dst+2*BYTES_PER_UINT32_T)  = src3;
+-                  *(uint32_t *) (dst+3*BYTES_PER_UINT32_T)  = src4;
+-                  *(uint32_t *) (dst+4*BYTES_PER_UINT32_T)  = src5;
+-                  *(uint32_t *) (dst+5*BYTES_PER_UINT32_T)  = src6;
+-                  *(uint32_t *) (dst+6*BYTES_PER_UINT32_T)  = src7;
+-                  *(uint32_t *) (dst+7*BYTES_PER_UINT32_T)  = src8;
+-                  return TRUE;
+-               }
+-               else {
+-                  // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
+-                  // Instead, just call multiple fixed functions.
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
+-                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
+-                  }
+-                  return TRUE;
+-               }
+-               break;
+-      case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+-                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+-                  vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+-                  vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3  = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4  = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
+-                  uint64_t src5  = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
+-                  uint64_t src6  = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
+-                  uint64_t src7  = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
+-                  uint64_t src8  = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
+-                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
+-                  *(uint64_t *) (dst+2*BYTES_PER_UINT64_T)  = src3;
+-                  *(uint64_t *) (dst+3*BYTES_PER_UINT64_T)  = src4;
+-                  *(uint64_t *) (dst+4*BYTES_PER_UINT64_T)  = src5;
+-                  *(uint64_t *) (dst+5*BYTES_PER_UINT64_T)  = src6;
+-                  *(uint64_t *) (dst+6*BYTES_PER_UINT64_T)  = src7;
+-                  *(uint64_t *) (dst+7*BYTES_PER_UINT64_T)  = src8;
+-                  return TRUE;
+-               }
+-               break;
+-      case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
+-                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
+-                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
+-                  vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
+-                  vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
+-                  vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
+-                  vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
+-                  vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
+-                  vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
+-                  return TRUE;
+-               }
+-               break;
+    }
+    return FALSE;
+@@ -519,427 +365,6 @@ swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *sr
+                }
+                return TRUE;
+                break;
+-      case  7: if (xdir >= 0) {
+-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+-               } else {
+-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+-                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+-               }
+-               return TRUE;
+-               break;
+-      case  8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
+-                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
+-                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
+-                  return TRUE;
+-               }
+-               else {
+-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+-                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
+-                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
+-                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
+-                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
+-                  *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T)  = src5a;
+-                  *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T)  = src6a;
+-                  *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T)  = src7a;
+-                  *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T)  = src8a;
+-                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
+-                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
+-                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
+-                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
+-                  *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T)  = src5b;
+-                  *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T)  = src6b;
+-                  *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T)  = src7b;
+-                  *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T)  = src8b;
+-                  return TRUE;
+-               }
+-               break;
+-      case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
+-                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
+-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
+-                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
+-                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
+-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+-                  *(uint32_t *) (dst+0*dpitch+0)                                        = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3a;
+-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5a;
+-                  *(uint32_t *) (dst+1*dpitch+0)                                        = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2b;
+-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3b;
+-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5b;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T)  = src5a;
+-                  *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T)  = src6a;
+-                  *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T)  = src7a;
+-                  *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T)  = src8a;
+-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
+-                  *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T)  = src5b;
+-                  *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T)  = src6b;
+-                  *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T)  = src7b;
+-                  *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T)  = src8b;
+-                  return TRUE;
+-               }
+-               else {
+-                  // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+-                  // Instead, just call multiple fixed functions.
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               break;
+-      case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                           src + 0,                            4, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T,      src + (4)*BYTES_PER_UINT16_T,      16, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T,   src + (16+4)*BYTES_PER_UINT16_T,    8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T,  4, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T,  4, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T,   src + (16+4)*BYTES_PER_UINT16_T,    8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T,      src + (4)*BYTES_PER_UINT16_T,      16, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                           src + 0,                            4, xdir, dpitch, spitch);
+-                   }
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+-                  uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
+-                  uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
+-                  uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
+-                  uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
+-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+-                  uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
+-                  uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
+-                  uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
+-                  uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
+-                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
+-                  *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T)  = src5a;
+-                  *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T)  = src6a;
+-                  *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T)  = src7a;
+-                  *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T)  = src8a;
+-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
+-                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
+-                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
+-                  *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T)  = src5b;
+-                  *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T)  = src6b;
+-                  *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T)  = src7b;
+-                  *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T)  = src8b;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               else {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               break;
+-      case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
+-                  return TRUE;
+-               }//HERE
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                             src + 0,                             4,    xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T,   src + (0*16+4)*BYTES_PER_UINT16_T,   2*16, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T,   src + (2*16+4)*BYTES_PER_UINT16_T,   16,   xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T,   src + (3*16+4)*BYTES_PER_UINT16_T,   8,    xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4,    xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4,    xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T,   src + (3*16+4)*BYTES_PER_UINT16_T,   8,    xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T,   src + (2*16+4)*BYTES_PER_UINT16_T,   16,   xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T,   src + (0*16+4)*BYTES_PER_UINT16_T,   2*16, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                             src + 0,                             4,    xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               break;
+    }
+    return FALSE;
+@@ -1161,484 +586,7 @@ swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *sr
+                }
+                return TRUE;
+                break;
+-      case  7: if (xdir >= 0) {
+-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+-               } else {
+-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
+-                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
+-               }
+-               return TRUE;
+-               break;
+-      // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
+-      //       For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
+-      case  8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
+-                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
+-                  *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T)  = src2c;
+-                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
+-                  *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T)  = src2d;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
+-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
+-                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
+-                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
+-                  *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T)  = src3c;
+-                  *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T)  = src4c;
+-                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
+-                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
+-                  *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T)  = src3d;
+-                  *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T)  = src4d;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
+-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
+-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
+-                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
+-                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
+-                  *(uint16_t *) (dst+2*dpitch+0)                                        = src1c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4c;
+-                  *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5c;
+-                  *(uint16_t *) (dst+3*dpitch+0)                                        = src1d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4d;
+-                  *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5d;
+-                  return TRUE;
+-               }
+-               else {
+-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
+-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
+-                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
+-                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
+-                  uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
+-                  uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
+-                  uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
+-                  uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
+-                  uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
+-                  uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
+-                  uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
+-                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
+-                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
+-                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
+-                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
+-                  *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T)  = src5a;
+-                  *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T)  = src6a;
+-                  *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T)  = src7a;
+-                  *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T)  = src8a;
+-                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
+-                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
+-                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
+-                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
+-                  *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T)  = src5b;
+-                  *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T)  = src6b;
+-                  *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T)  = src7b;
+-                  *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T)  = src8b;
+-                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
+-                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
+-                  *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T)  = src3c;
+-                  *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T)  = src4c;
+-                  *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T)  = src5c;
+-                  *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T)  = src6c;
+-                  *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T)  = src7c;
+-                  *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T)  = src8c;
+-                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
+-                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
+-                  *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T)  = src3d;
+-                  *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T)  = src4d;
+-                  *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T)  = src5d;
+-                  *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T)  = src6d;
+-                  *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T)  = src7d;
+-                  *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T)  = src8d;
+-                  return TRUE;
+-               }
+-               break;
+-      case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
+-                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
+-                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
+-                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
+-                  uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
+-                  uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
+-                  uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
+-                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
+-                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
+-                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
+-                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
+-                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
+-                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
+-                  *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T)  = src2c;
+-                  *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T)  = src3c;
+-                  *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T)  = src4c;
+-                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
+-                  *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T)  = src2d;
+-                  *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T)  = src3d;
+-                  *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T)  = src4d;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
+-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
+-                  uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+-                  uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+-                  uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
+-                  uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+-                  uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+-                  uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+-                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
+-                  uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+-                  uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+-                  uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+-                  uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+-                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
+-                  uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
+-                  uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
+-                  uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
+-                  uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
+-                  *(uint32_t *) (dst+0*dpitch+0)                                        = src1a;
+-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2a;
+-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3a;
+-                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5a;
+-                  *(uint32_t *) (dst+1*dpitch+0)                                        = src1b;
+-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2b;
+-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3b;
+-                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5b;
+-                  *(uint32_t *) (dst+2*dpitch+0)                                        = src1c;
+-                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2c;
+-                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3c;
+-                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5c;
+-                  *(uint32_t *) (dst+3*dpitch+0)                                        = src1d;
+-                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2d;
+-                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3d;
+-                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5d;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
+-                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
+-                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
+-                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
+-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
+-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
+-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
+-                  uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
+-                  uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
+-                  uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
+-                  uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
+-                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T)  = src5a;
+-                  *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T)  = src6a;
+-                  *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T)  = src7a;
+-                  *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T)  = src8a;
+-                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
+-                  *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T)  = src5b;
+-                  *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T)  = src6b;
+-                  *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T)  = src7b;
+-                  *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T)  = src8b;
+-                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
+-                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
+-                  *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T)  = src3c;
+-                  *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T)  = src4c;
+-                  *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T)  = src5c;
+-                  *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T)  = src6c;
+-                  *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T)  = src7c;
+-                  *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T)  = src8c;
+-                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
+-                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
+-                  *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T)  = src3d;
+-                  *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T)  = src4d;
+-                  *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T)  = src5d;
+-                  *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T)  = src6d;
+-                  *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T)  = src7d;
+-                  *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T)  = src8d;
+-                  return TRUE;
+-               }
+-               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
+-                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
+-                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+-                  uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+-                  uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+-                  uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+-                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
+-                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+-                  uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+-                  uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+-                  uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+-                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
+-                  uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+-                  uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+-                  uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+-                  uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+-                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
+-                  uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
+-                  uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
+-                  uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
+-                  uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
+-                  uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
+-                  uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
+-                  uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
+-                  uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
+-                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7a;
+-                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8a;
+-                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9a;
+-                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7b;
+-                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8b;
+-                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9b;
+-                  *(uint16_t *) (dst+2*dpitch+0)                                        = src1c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7c;
+-                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8c;
+-                  *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9c;
+-                  *(uint16_t *) (dst+3*dpitch+0)                                        = src1d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7d;
+-                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8d;
+-                  *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9d;
+-                  return TRUE;
+-               }
+-               else {
+-                  // Don't bother unrolling loops, since that won't help for more than around 8 operations.
+-                  // Instead, just call multiple fixed functions.
+-                  if (xdir >= 0) {
+-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                  } else {
+-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
+-                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
+-                  }
+-                  return TRUE;
+-               }
+-               break;
+-      // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
+-      //       For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
+-      case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
+-                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
+-                  uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
+-                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
+-                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
+-                  vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
+-                  vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
+-                  return TRUE;
+-               }
+-               break;
+-   }
++   }
+    return FALSE;
+ }
+@@ -1924,10 +872,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
+       if (rowsOverlap)
+       {
+          if (w > 64) {
+-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+          }
+          else if (w == 64) {
+-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+          }
+          else {
+             DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
+@@ -1936,10 +886,12 @@ swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int
+       else
+       {
+          if (w > 64) {
+-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+          }
+          else if (w == 64) {
+-            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            //DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
++            DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
+          }
+          else {
+             DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
+@@ -1973,7 +925,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
+       if (xdir >= 0 || !rowsOverlap) {
+          if (w >= 128) {
+             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+-            neon_memcpy(dst, src, w);
++            //neon_memcpy(dst, src, w);
++            memcpy(dst, src, w);
+             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+          }
+          else
+@@ -1982,7 +935,8 @@ swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, i
+       else {
+          if (w >= 128) {
+             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+-            neon_memmove(dst, src, w);
++            //neon_memmove(dst, src, w);
++            memmove(dst, src, w);
+             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+          }
+          else
+@@ -2029,7 +983,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
+       if (xdir >= 0 || !rowsOverlap) {
+          if (w >= 42) {
+             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+-            neon_memcpy(dst, src,  w * BYTES_PER_24BPP_PIXEL);
++            //neon_memcpy(dst, src,  w * BYTES_PER_24BPP_PIXEL);
++            memcpy(dst, src,  w * BYTES_PER_24BPP_PIXEL);
+             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+          }
+          else
+@@ -2038,7 +993,8 @@ swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h,
+       else {
+          if (w >= 42) {
+             BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
+-            neon_memmove(dst, src,  w * BYTES_PER_24BPP_PIXEL);
++            //neon_memmove(dst, src,  w * BYTES_PER_24BPP_PIXEL);
++            memmove(dst, src,  w * BYTES_PER_24BPP_PIXEL);
+             UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
+          }
+          else
+diff --git git/src/neon_memcpy.S git/src/neon_memcpy.S
+deleted file mode 100644
+index 5ecc5ce..0000000
+--- git/src/neon_memcpy.S
++++ /dev/null
+@@ -1,549 +0,0 @@
+-/***************************************************************************
+- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+-
+- Redistribution and use in source and binary forms, with or without
+- modification, are permitted provided that the following conditions are met:
+-     * Redistributions of source code must retain the above copyright
+-       notice, this list of conditions and the following disclaimer.
+-     * Redistributions in binary form must reproduce the above copyright
+-       notice, this list of conditions and the following disclaimer in the
+-       documentation and/or other materials provided with the distribution.
+-     * Neither the name of Code Aurora nor the names of its contributors may
+-       be used to endorse or promote products derived from this software
+-       without specific prior written permission.
+-
+- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+- POSSIBILITY OF SUCH DAMAGE.
+- ***************************************************************************/
+-
+-/***************************************************************************
+-  Neon memcpy: Attempts to do a memcpy with Neon registers if possible,
+-     Inputs:
+-        dest: The destination buffer
+-        src: The source buffer
+-        n: The size of the buffer to transfer
+-     Outputs:
+-
+-***************************************************************************/
+-
+-/*
+- * General note:
+- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
+- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
+- * the correct object code for VPOP, resulting in horrific stack crashes.
+- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
+- * and VPOP->VLDMIA.  We can revert this back once we update our toolchain.
+- *
+- * Also, VSHL swaps the source register and the shift-amount register
+- * around in 2006-q3.  I've coded this incorrectly so it turns out correct
+- * in the object code, but we'll need to undo that later...
+- */
+-
+-      .code 32
+-      .align 4
+-      .globl neon_memcpy
+-      .func
+-
+-neon_memcpy:
+-      /* 
+-       * First, make sure we're not copying < 4 bytes.  If so, we'll
+-         * just handle it here.
+-       */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r0}
+-#else
+-      push            {r0}
+-#endif
+-      cmp             r2, #4
+-      bgt             neon_gt_4
+-      /* Copy 0-4 bytes, if needed, and return.*/
+-      cmp             r2, #0
+-neon_smallcopy_loop:
+-      beq             neon_smallcopy_done
+-      ldrb            r12, [r1], #1
+-      subs            r2, r2, #1
+-      strb            r12, [r0], #1
+-      b               neon_smallcopy_loop
+-neon_smallcopy_done:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r0}
+-#else
+-      pop             {r0}
+-#endif
+-      bx              lr
+-
+-      /* Copy 4 or more bytes*/
+-neon_gt_4:
+-      /* Preload what we can...*/
+-      pld             [r0,#0]
+-      pld             [r1,#0]
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r4-r5}
+-#else
+-      push            {r4-r5}
+-#endif
+-
+-neon_check_align:
+-      /* Check normal word alignment for target. */
+-      ands            r12, r0, #0x3
+-      beq             source_alignment_check
+-      
+-      /*
+-       * Target is not aligned.  Step through until we get that
+-       * word-aligned.  This works better than a loop, according
+-       * to our pipeline modeler.
+-       */
+-      cmp             r12, #2
+-      ldrb            r3, [r1], #1
+-      ldrleb          r4, [r1], #1
+-      ldrltb          r5, [r1], #1
+-      rsb             r12, r12, #4
+-      sub             r2, r2, r12
+-      strb            r3, [r0], #1
+-      strleb          r4, [r0], #1
+-      strltb          r5, [r0], #1
+-      
+-source_alignment_check:
+-      ands            r12, r1, #0x3
+-      bne             neon_memcpy_nonaligned  /* Source is not word aligned.*/
+-neon_try_16_align:
+-      cmp             r2, #64
+-      blt             neon_align_route
+-      /* This is where we try 16-byte alignment. */
+-      ands            r12, r0, #0xf
+-      beq             neon_align_route
+-      rsb             r12, r12, #16
+-neon_16_start:
+-      sub             r2, r2, r12
+-      lsrs            r3, r12, #2
+-neon_align_16_4:
+-      ldr             r4, [r1], #4
+-      subs            r3, r3, #1
+-      str             r4, [r0], #4
+-      bne             neon_align_16_4
+-neon_align_route:
+-      /* In this case, both source and target are word-aligned. */
+-      cmp             r2, #32768
+-      bge             neon_copy_128p_a
+-      cmp             r2, #256
+-      bge             neon_copy_128_a
+-      cmp             r2, #64
+-      bge             neon_copy_32_a
+-      b               neon_copy_finish_a
+-      nop
+-neon_copy_128p_a:
+-      /* We'll copy blocks 128-bytes at a time, but try to call pld to
+-       * load in the next page, if possible.
+-       */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4-q7}
+-#else
+-      vpush           {q4-q7}
+-#endif
+-      mov             r12, r2, lsr #7
+-neon_copy_128p_loop_a:
+-      vld1.32         {q0, q1}, [r1]!
+-      vld1.32         {q2, q3}, [r1]!
+-      vld1.32         {q4, q5}, [r1]!
+-      vld1.32         {q6, q7}, [r1]!
+-      pld             [r1, #0]
+-      pld             [r1, #1024]
+-      vst1.32         {q0, q1}, [r0]!
+-      vst1.32         {q2, q3}, [r0]!
+-      vst1.32         {q4, q5}, [r0]!
+-      vst1.32         {q6, q7}, [r0]!
+-      subs            r12, r12, #1
+-      bne             neon_copy_128p_loop_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4-q7}
+-#else
+-      vpop            {q4-q7}
+-#endif
+-      ands            r2, r2, #0x7f
+-      beq             neon_end
+-      cmp             r2, #32
+-      blt             neon_copy_finish_a
+-      b               neon_copy_32_a
+-      /* Copy blocks of 128-bytes (word-aligned) at a time*/
+-neon_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4-q7}
+-#else
+-      vpush           {q4-q7}
+-#endif
+-      /*
+-       * Move to a 1-s based countdown to determine when to loop.  That
+-       * allows the subs to set the Z flag without having to explicitly
+-       * call cmp to a value.
+-       */
+-      mov             r12, r2, lsr #7
+-neon_copy_128_loop_a:
+-      vld1.32         {q0, q1}, [r1]!
+-      vld1.32         {q2, q3}, [r1]!
+-      vld1.32         {q4, q5}, [r1]!
+-      vld1.32         {q6, q7}, [r1]!
+-      pld             [r1, #0]
+-      pld             [r1, #128]
+-      vst1.32         {q0, q1}, [r0]!
+-      vst1.32         {q2, q3}, [r0]!
+-      vst1.32         {q4, q5}, [r0]!
+-      vst1.32         {q6, q7}, [r0]!
+-      subs            r12, r12, #1
+-      pld             [r0, #0]
+-      pld             [r0, #128]
+-      bne             neon_copy_128_loop_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4-q7}
+-#else
+-      vpop            {q4-q7}
+-#endif
+-      ands            r2, r2, #0x7f
+-      beq             neon_end
+-      cmp             r2, #32
+-      blt             neon_copy_finish_a
+-      /* Copy blocks of 32-bytes (word aligned) at a time*/
+-neon_copy_32_a:
+-      mov             r12, r2, lsr #5
+-neon_copy_32_loop_a:
+-      vld1.32         {q0,q1}, [r1]!
+-      subs            r12, r12, #1
+-      pld             [r1,#0]
+-      vst1.32         {q0,q1}, [r0]!
+-      bne             neon_copy_32_loop_a
+-      ands            r2, r2, #0x1f
+-      beq             neon_end
+-neon_copy_finish_a:
+-neon_copy_16_a:
+-      movs            r12, r2, lsr #4
+-      beq             neon_copy_8_a
+-neon_copy_16_a_loop:
+-      vld1.32         {q0}, [r1]!
+-      subs            r12, r12, #1
+-      vst1.32         {q0}, [r0]!
+-      bne             neon_copy_16_a_loop
+-      ands            r2, r2, #0xf
+-      beq             neon_end
+-neon_copy_8_a:
+-      cmp             r2, #8
+-      blt             neon_copy_4_a
+-      ldm             r1!, {r4-r5}
+-      subs            r2, r2, #8
+-      stm             r0!, {r4-r5}
+-      /* Copy 4-bytes of word-aligned data at a time*/
+-neon_copy_4_a:
+-      cmp             r2, #4
+-      blt             neon_copy_finish
+-      ldr             r4, [r1], #4
+-      subs            r2, r2, #4
+-      str             r4, [r0], #4
+-      b               neon_copy_finish
+-
+-      /*
+-       * Handle unaligned data.  The basic concept here is that we'll
+-       * try to pull out enough data from the source to get that word-
+-       * aligned, then do our writes word-aligned, storing the difference
+-       * in a register, and shifting the data as needed.
+-       */
+-neon_memcpy_nonaligned:
+-      /*
+-       * If this is <8 bytes, it makes more sense to just copy it
+-       * quickly instead of incurring all kinds of overhead.
+-       */
+-      cmp             r2, #8 /* Let's try this...*/
+-      ble             neon_copy_finish
+-      /*
+-       * This is where we'll pull out either 1, 2, or 3 bytes of data
+-       * from the source as needed to align it, then store off those
+-       * bytes in r4.  When we read in the (now) aligned data from the
+-       * source, we'll shift the bytes and AND in the r4 data, then write
+-       * to the target aligned.
+-       *
+-       * The conditional ldr calls work slightly faster than the
+-       * previous method, confirmed by our pipeline modeler.
+-       */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r6-r9}
+-#else
+-      push            {r6-r9}
+-#endif
+-      cmp             r12, #2
+-      ldrb            r4, [r1], #1
+-      ldrleb          r5, [r1], #1
+-      ldrltb          r6, [r1], #1
+-      rsb             r8, r12, #4
+-      sub             r2, r2, r8
+-      lsl             r8, r8, #3
+-      orrle           r4, r4, r5, lsl #8
+-      orrlt           r4, r4, r6, lsl #16
+-      rsb             r9, r8, #32
+-      
+-      cmp             r2, #64
+-      blt             neon_unaligned_route
+-      ands            r12, r0, #0xf
+-      beq             neon_unaligned_route
+-      rsb             r12, r12, #16
+-neon_16_start_u:
+-      sub             r2, r2, r12
+-      lsrs            r6, r12, #2
+-neon_align_16_4_u:
+-      ldr             r5, [r1], #4
+-      subs            r6, r6, #1
+-      orr             r4, r4, r5, lsl r8
+-      str             r4, [r0], #4
+-      mov             r4, r5, lsr r9
+-      bne             neon_align_16_4_u
+-neon_unaligned_route:
+-      /* Decide which loop block to branch to.*/
+-      cmp             r2, #256
+-      bge             neon_copy_64_u
+-      cmp             r2, #64
+-      bge             neon_copy_32_u
+-      b               neon_copy_finish_u
+-      /* Copy data in 64-byte blocks.*/
+-neon_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4}
+-      vstmdb          sp!, {q5-q8}
+-#else
+-      vpush           {q4}
+-      vpush           {q5-q8}
+-#endif
+-      /* We'll need this for the q register shift later.*/
+-      vdup.u32        q8, r8
+-      /*
+-       * As above, we determine how many times we can go through the
+-       * 64-byte copy loop, then countdown.
+-       */
+-      mov             r12, r2, lsr #6
+-      and             r2, r2, #0x3f
+-neon_copy_64_u_loop:
+-      /* Load 64-bytes into q4-q7.*/
+-      vld1.32         {q4, q5}, [r1]!
+-      vld1.32         {q6, q7}, [r1]!
+-      /*
+-       * Shift q0-q3 right so everything but the data we need due to the
+-       * alignment falls off the right-hand side.  The branching
+-       * is needed, since vshr requires the shift to be an immediate
+-       * value.
+-       */
+-      lsls            r5, r8, #28
+-      bcc             neon_copy_64_u_b8
+-      bpl             neon_copy_64_u_b16
+-      vshr.u64        q0, q4, #40
+-      vshr.u64        q1, q5, #40
+-      vshr.u64        q2, q6, #40
+-      vshr.u64        q3, q7, #40
+-      b               neon_copy_64_unify
+-neon_copy_64_u_b8:
+-      vshr.u64        q0, q4, #56
+-      vshr.u64        q1, q5, #56
+-      vshr.u64        q2, q6, #56
+-      vshr.u64        q3, q7, #56
+-      b               neon_copy_64_unify
+-neon_copy_64_u_b16:
+-      vshr.u64        q0, q4, #48
+-      vshr.u64        q1, q5, #48
+-      vshr.u64        q2, q6, #48
+-      vshr.u64        q3, q7, #48
+-neon_copy_64_unify:
+-      /*
+-       * Shift q4-q7 left by r8 bits to take the alignment into
+-       * account.
+-       */
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q4, q8, q4
+-      vshl.u64        q5, q8, q5
+-      vshl.u64        q6, q8, q6
+-      vshl.u64        q7, q8, q7
+-#else
+-      vshl.u64        q4, q4, q8
+-      vshl.u64        q5, q5, q8
+-      vshl.u64        q6, q6, q8
+-      vshl.u64        q7, q7, q8
+-#endif
+-      /*
+-       * The data in s14 will be needed for the next loop iteration.  Move
+-       * that to r5.
+-       */
+-      vmov            r5, s14
+-      /* We'll vorr the shifted data with the data that needs to move back.*/
+-      vorr            d9, d9, d0
+-      /* Copy the data from the previous loop into s14.*/
+-      vmov            s14, r4
+-      vorr            d10, d10, d1
+-      vorr            d11, d11, d2
+-      vorr            d12, d12, d3
+-      vorr            d13, d13, d4
+-      vorr            d14, d14, d5
+-      vorr            d15, d15, d6
+-      vorr            d8, d8, d7
+-      subs            r12, r12, #1
+-      pld             [r1, #0]
+-      pld             [r1, #128]
+-      /* Save off the r5 data into r4 for the next iteration.*/
+-      mov             r4, r5
+-      vst1.32         {q4, q5}, [r0]!
+-      vst1.32         {q6, q7}, [r0]!
+-      pld             [r0, #0]
+-      pld             [r0, #128]
+-      bne             neon_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q5-q8}
+-      vldmia          sp!, {q4}
+-#else
+-      vpop            {q5-q8}
+-      vpop            {q4}
+-#endif
+-      cmp             r2, #32
+-      bge             neon_copy_32_u
+-      b               neon_copy_finish_u
+-neon_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4}
+-#else
+-      vpush           {q4}
+-#endif
+-      vdup.u32        q4, r8
+-      mov             r12, r2, lsr #5
+-      and             r2, r2, #0x1f
+-neon_copy_32_u_loop:
+-      vld1.32         {q0, q1}, [r1]!
+-      lsls            r5, r8, #28
+-      bcc             neon_copy_32_u_b8
+-      bpl             neon_copy_32_u_b16
+-      vshr.u64        q2, q0, #40
+-      vshr.u64        q3, q1, #40
+-      b               neon_copy_32_unify
+-neon_copy_32_u_b8:
+-      vshr.u64        q2, q0, #56
+-      vshr.u64        q3, q1, #56
+-      b               neon_copy_32_unify
+-neon_copy_32_u_b16:
+-      vshr.u64        q2, q0, #48
+-      vshr.u64        q3, q1, #48
+-neon_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q0, q4, q0
+-      vshl.u64        q1, q4, q1
+-#else
+-      vshl.u64        q0, q0, q4
+-      vshl.u64        q1, q1, q4
+-#endif
+-      vmov            r5, s14
+-      vorr            d1, d1, d4
+-      vmov            s14, r4
+-      vorr            d2, d2, d5
+-      vorr            d3, d3, d6
+-      vorr            d0, d0, d7
+-      subs            r12, r12, #1
+-      pld             [r1, #0]
+-      mov             r4, r5
+-      vst1.32         {q0, q1}, [r0]!
+-      bne             neon_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4}
+-#else
+-      vpop            {q4}
+-#endif
+-neon_copy_finish_u:
+-neon_copy_16_u:
+-      movs            r12, r2, lsr #4
+-      beq             neon_copy_8_u
+-      vdup.u32        q2, r8
+-      and             r2, r2, #0xf
+-neon_copy_16_u_loop:
+-      vld1.32         {q0}, [r1]!
+-      lsls            r5, r8, #28
+-      bcc             neon_copy_16_u_b8
+-      bpl             neon_copy_16_u_b16
+-      vshr.u64        q1, q0, #40
+-      b               neon_copy_16_unify
+-neon_copy_16_u_b8:
+-      vshr.u64        q1, q0, #56
+-      b               neon_copy_16_unify
+-neon_copy_16_u_b16:
+-      vshr.u64        q1, q0, #48
+-neon_copy_16_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q0, q2, q0
+-#else
+-      vshl.u64        q0, q0, q2
+-#endif
+-      vmov            r5, s6
+-      vorr            d1, d1, d2
+-      vmov            s6, r4
+-      vorr            d0, d0, d3
+-      subs            r12, r12, #1
+-      mov             r4, r5
+-      vst1.32         {q0}, [r0]!
+-      bne             neon_copy_16_u_loop
+-neon_copy_8_u:
+-      cmp             r2, #8
+-      blt             neon_copy_4_u
+-      ldm             r1!, {r6-r7}
+-      subs            r2, r2, #8
+-      orr             r4, r4, r6, lsl r8
+-      mov             r5, r6, lsr r9
+-      orr             r5, r5, r7, lsl r8
+-      stm             r0!, {r4-r5}
+-      mov             r4, r7, lsr r9
+-neon_copy_4_u:
+-      cmp             r2, #4
+-      blt             neon_copy_last_bits_u
+-      ldr             r5, [r1], #4
+-      subs            r2, r2, #4
+-      orr             r4, r4, r5, lsl r8
+-      str             r4, [r0], #4
+-      mov             r4, r5, lsr r9
+-neon_copy_last_bits_u:
+-      /*
+-       * Remember, r8 contains the size of the data in r4 in bits,
+-       * so to get to bytes we'll need to shift 3 places
+-       */
+-      lsr             r8, r8, #0x3
+-      /* Write out the bytes stored in r4.*/
+-neon_copy_last_bits_u_loop:
+-      strb            r4, [r0], #1
+-      subs            r8, r8, #1
+-      lsrne           r4, r4, #8
+-      bne             neon_copy_last_bits_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r6-r9}
+-#else
+-      pop             {r6-r9}
+-#endif
+-neon_copy_finish:
+-      cmp             r2, #0
+-      beq             neon_end
+-      /*
+-       * This just copies the data from source to target one byte
+-       * at a time.  For some small values, this makes more sense.
+-       * Note that since this code copies data a byte at a time,
+-       * both the aligned and unaligned paths can use it.
+-       */
+-neon_copy_finish_loop:
+-      ldrb            r4, [r1], #1
+-      subs            r2, r2, #1
+-      strb            r4, [r0], #1
+-      bne             neon_copy_finish_loop
+-neon_end:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r4-r5}
+-      ldmia           sp!, {r0}
+-#else
+-      pop             {r4-r5}
+-      pop             {r0}
+-#endif
+-      bx              lr
+-
+-      .endfunc
+-      .end
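The unaligned path deleted above word-aligns the source by pulling in one to three bytes, parks them in a spare register, then merges them into every subsequent aligned word with shifts and ORs. A minimal plain-C sketch of that idea, assuming little-endian byte order (illustration only, not part of the driver):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch of the shift-and-merge copy used by the unaligned path:
     * 'carry' holds source bytes that were read to reach word alignment
     * but not yet written; each aligned load is combined with the carry. */
    static void copy_unaligned_sketch(uint8_t *dst, const uint8_t *src, size_t n)
    {
        uint32_t carry = 0;
        unsigned carry_bits = 0;
        unsigned misalign = (uintptr_t)src & 3;

        if (misalign && n >= 4) {
            unsigned lead = 4 - misalign;            /* 1..3 bytes to align src */
            for (unsigned i = 0; i < lead; i++)
                carry |= (uint32_t)*src++ << (8 * i);
            carry_bits = 8 * lead;
            n -= lead;
        }
        while (carry_bits && n >= 4) {
            uint32_t word, out;
            memcpy(&word, src, 4);                   /* aligned 32-bit load */
            src += 4;
            out = carry | (word << carry_bits);      /* old bytes low, new bytes high */
            memcpy(dst, &out, 4);
            dst += 4;
            carry = word >> (32 - carry_bits);       /* keep the leftover bytes */
            n -= 4;
        }
        while (carry_bits) {                         /* flush pending carry bytes */
            *dst++ = (uint8_t)carry;
            carry >>= 8;
            carry_bits -= 8;
        }
        while (n--)                                  /* byte-at-a-time tail */
            *dst++ = *src++;
    }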
+diff --git git/src/neon_memmove.S git/src/neon_memmove.S
+deleted file mode 100644
+index 1bfe597..0000000
+--- git/src/neon_memmove.S
++++ /dev/null
+@@ -1,939 +0,0 @@
+-/***************************************************************************
+- Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+-
+- Redistribution and use in source and binary forms, with or without
+- modification, are permitted provided that the following conditions are met:
+-     * Redistributions of source code must retain the above copyright
+-       notice, this list of conditions and the following disclaimer.
+-     * Redistributions in binary form must reproduce the above copyright
+-       notice, this list of conditions and the following disclaimer in the
+-       documentation and/or other materials provided with the distribution.
+-     * Neither the name of Code Aurora nor the names of its contributors may
+-       be used to endorse or promote products derived from this software
+-       without specific prior written permission.
+-
+- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+- POSSIBILITY OF SUCH DAMAGE.
+-  ***************************************************************************/
+-
+-/***************************************************************************
+- *  Neon memmove: Attempts to do a memmove with Neon registers if possible,
+- *     Inputs:
+- *        dest: The destination buffer
+- *        src: The source buffer
+- *        n: The size of the buffer to transfer
+- *     Outputs:
+- *
+- ***************************************************************************/
+-
+-/*
+- * General note:
+- * The original code that was compiled for rvct used PUSH/POP and VPUSH/VPOP
+- * However, it looks like the 2006 CodeSourcery Assembler has issues generating
+- * the correct object code for VPOP, resulting in horrific stack crashes.
+- * As a result, I've temporarily move PUSH->STMDB, POP->LDMIA, VPUSH->VSTMDB,
+- * and VPOP->VLDMIA.  We can revert this back once we update our toolchain.
+- *
+- * Also, VSHL swaps the source register and the shift-amount register
+- * around in 2006-q3.  I've coded this incorrectly so it turns out correct
+- * in the object code, but we'll need to undo that later...
+- */
+-      .code 32
+-      .align 4
+-      .globl neon_memmove
+-      .func
+-
+-neon_memmove:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r0}
+-#else
+-      push            {r0}
+-#endif
+-
+-      /*
+-       * The requirements for memmove state that the function should
+-       * operate as if data were being copied from the source to a
+-       * buffer, then to the destination.  This is to allow a user
+-       * to copy data from a source and target that overlap.
+-       *
+-       * We can't just do byte copies front-to-back automatically, since
+-       * there's a good chance we may have an overlap (why else would someone
+-       * intentionally use memmove then?).
+-       *
+-       * We'll break this into two parts.  Front-to-back, or back-to-front
+-       * copies.
+-       */
+-neon_memmove_cmf:
+-      cmp             r0, r1
+-      blt             neon_front_to_back_copy
+-      bgt             neon_back_to_front_copy
+-      b               neon_memmove_done
+-
+-      /* #############################################################
+-       * Front to Back copy
+-       */
+-neon_front_to_back_copy:
+-      /*
+-       * For small copies, just do a quick memcpy.  We can do this for
+-       * front-to-back copies, aligned or unaligned, since we're only
+-       * doing 1 byte at a time...
+-       */
+-      cmp             r2, #4
+-      bgt             neon_f2b_gt4
+-      cmp             r2, #0
+-neon_f2b_smallcopy_loop:
+-      beq             neon_memmove_done
+-      ldrb            r12, [r1], #1
+-      subs            r2, r2, #1
+-      strb            r12, [r0], #1
+-      b               neon_f2b_smallcopy_loop
+-neon_f2b_gt4:
+-      /* Preload what we can...*/
+-      pld             [r0,#0]
+-      pld             [r1,#0] 
+-      /* The window size is in r3. */
+-      sub             r3, r1, r0
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r4-r6}
+-#else
+-      push            {r4-r6}
+-#endif
+-
+-neon_f2b_check_align:
+-      /* Check alignment. */
+-      ands            r12, r0, #0x3
+-      beq             neon_f2b_source_align_check
+-      cmp             r12, #2
+-      ldrb            r4, [r1], #1
+-      ldrleb          r5, [r1], #1
+-      ldrltb          r6, [r1], #1
+-      rsb             r12, r12, #4
+-      sub             r2, r2, r12
+-      strb            r4, [r0], #1
+-      strleb          r5, [r0], #1
+-      strltb          r6, [r0], #1
+-      
+-neon_f2b_source_align_check:
+-      ands            r12, r1, #0x3
+-      bne             neon_f2b_nonaligned
+-
+-neon_f2b_try_16_align:
+-      /* If we're >64, attempt to align on 16-bytes.  Smaller amounts
+-       * don't seem to be worth handling. */
+-      cmp             r2, #64
+-      blt             neon_f2b_align_route
+-      /* This is where we try 16-byte alignment. */
+-      ands            r12, r0, #0xf
+-      beq             neon_f2b_align_route
+-      rsb             r12, r12, #16
+-neon_f2b_16_start:
+-      sub             r2, r2, r12
+-      lsrs            r5, r12, #2
+-neon_f2b_align_16_4:
+-      ldr             r4, [r1], #4
+-      subs            r5, r5, #1
+-      str             r4, [r0], #4
+-      bne             neon_f2b_align_16_4
+-neon_f2b_align_route:
+-      /* #############################################################
+-       * Front to Back copy - aligned
+-       */
+-      /*
+-       * Note that we can't just route based on the size in r2.  If that's
+-       * larger than the overlap window in r3, we could potentially
+-       * (and likely!) destroy data we're copying.
+-       */
+-      cmp             r2, r3
+-      movle           r12, r2
+-      movgt           r12, r3
+-      cmp             r12, #256
+-      bge             neon_f2b_copy_128_a
+-      cmp             r12, #64
+-      bge             neon_f2b_copy_32_a
+-      cmp             r12, #16
+-      bge             neon_f2b_copy_16_a
+-      cmp             r12, #8
+-      bge             neon_f2b_copy_8_a
+-      cmp             r12, #4
+-      bge             neon_f2b_copy_4_a
+-      b               neon_f2b_copy_1_a
+-neon_f2b_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4-q7}
+-#else
+-      vpush           {q4-q7}
+-#endif
+-      mov             r12, r2, lsr #7
+-neon_f2b_copy_128_a_loop:
+-      vld1.32         {q0,q1}, [r1]!
+-      vld1.32         {q2,q3}, [r1]!
+-      vld1.32         {q4,q5}, [r1]!
+-      vld1.32         {q6,q7}, [r1]!
+-      pld             [r1, #0]
+-      pld             [r1, #128]
+-      vst1.32         {q0,q1}, [r0]!
+-      vst1.32         {q2,q3}, [r0]!
+-      vst1.32         {q4,q5}, [r0]!
+-      vst1.32         {q6,q7}, [r0]!
+-      subs            r12, r12, #1
+-      pld             [r0, #0]
+-      pld             [r0, #128]
+-      bne             neon_f2b_copy_128_a_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4-q7}
+-#else
+-      vpop            {q4-q7}
+-#endif
+-      ands            r2, r2, #0x7f
+-      beq             neon_f2b_finish
+-      cmp             r2, #32
+-      bge             neon_f2b_copy_32_a
+-      b               neon_f2b_copy_finish_a
+-neon_f2b_copy_32_a:
+-      mov             r12, r2, lsr #5
+-neon_f2b_copy_32_a_loop:
+-      vld1.32         {q0,q1}, [r1]!
+-      subs            r12, r12, #1
+-      pld             [r1, #0]
+-      vst1.32         {q0,q1}, [r0]!
+-      bne             neon_f2b_copy_32_a_loop
+-      ands            r2, r2, #0x1f
+-      beq             neon_f2b_finish
+-neon_f2b_copy_finish_a:
+-neon_f2b_copy_16_a:
+-      movs            r12, r2, lsr #4
+-      beq             neon_f2b_copy_8_a
+-neon_f2b_copy_16_a_loop:
+-      vld1.32         {q0}, [r1]!
+-      subs            r12, r12, #1
+-      vst1.32         {q0}, [r0]!
+-      bne             neon_f2b_copy_16_a_loop
+-      ands            r2, r2, #0xf
+-      beq             neon_f2b_finish
+-neon_f2b_copy_8_a:
+-      cmp             r2, #8
+-      blt             neon_f2b_copy_4_a
+-      ldm             r1!, {r4-r5}
+-      subs            r2, r2, #8
+-      stm             r0!, {r4-r5}
+-neon_f2b_copy_4_a:
+-      cmp             r2, #4
+-      blt             neon_f2b_copy_1_a
+-      ldr             r4, [r1], #4
+-      subs            r2, r2, #4
+-      str             r4, [r0], #4
+-neon_f2b_copy_1_a:
+-      cmp             r2, #0
+-      beq             neon_f2b_finish
+-neon_f2b_copy_1_a_loop:
+-      ldrb            r12, [r1], #1
+-      subs            r2, r2, #1
+-      strb            r12, [r0], #1
+-      bne             neon_f2b_copy_1_a_loop
+-      b               neon_f2b_finish
+-              
+-      /* #############################################################
+-       * Front to Back copy - unaligned
+-       */
+-neon_f2b_nonaligned:
+-      /*
+-       * For sizes < 8, does it really make sense to do the whole shift
+-       * party?  Note that we DON'T want to call neon_f2b_copy_1_u,
+-       * since we'll end up trying to pop r8-r11, and we DON'T want
+-       * to do that...
+-       */
+-      cmp             r2, #8
+-      ble             neon_f2b_copy_1_a
+-
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r7-r9}
+-#else
+-      push            {r7-r9}
+-#endif
+-      cmp             r12, #2
+-      ldrb            r4, [r1], #1
+-      ldrleb          r5, [r1], #1
+-      ldrltb          r6, [r1], #1
+-      rsb             r8, r12, #4
+-      sub             r2, r2, r8
+-      lsl             r8, r8, #3
+-      orrle           r4, r4, r5, lsl #8
+-      orrlt           r4, r4, r6, lsl #16
+-      rsb             r9, r8, #32
+-      /*
+-       * r4  = overflow bits
+-       * r8 = # of bits we copied into the r4 register to align source.
+-       * r9 = 32 - r8
+-       * r12 = Index counter for each size, so we determine how many times
+-       *       the given size will go into r2, then count down that # of
+-       *       times in r12.
+-       */
+-      cmp             r2, #64
+-      blt             neon_f2b_unaligned_route
+-      ands            r12, r0, #0xf
+-      beq             neon_f2b_unaligned_route
+-      cmp             r3, #4
+-      blt             neon_f2b_unaligned_route
+-      rsb             r12, r12, #16
+-neon_f2b_16_start_u:
+-      sub             r2, r2, r12
+-      lsrs            r6, r12, #2
+-neon_f2b_align_16_4_u:
+-      ldr             r5, [r1], #4
+-      subs            r6, r6, #1
+-      orr             r4, r4, r5, lsl r8
+-      str             r4, [r0], #4
+-      mov             r4, r5, lsr r9
+-      bne             neon_f2b_align_16_4_u
+-neon_f2b_unaligned_route:
+-      cmp             r2, r3
+-      movle           r12, r2
+-      movgt           r12, r3
+-      cmp             r12, #256
+-      bge             neon_f2b_copy_64_u
+-      cmp             r12, #64
+-      bge             neon_f2b_copy_32_u
+-      cmp             r12, #16
+-      bge             neon_f2b_copy_16_u
+-      cmp             r12, #8
+-      bge             neon_f2b_copy_8_u
+-      cmp             r12, #4
+-      bge             neon_f2b_copy_4_u
+-      b               neon_f2b_last_bits_u
+-neon_f2b_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4}
+-      vstmdb          sp!, {q5-q8}
+-#else
+-      vpush           {q4}
+-      vpush           {q5-q8}
+-#endif
+-      vdup.u32        q8, r8
+-      mov             r12, r2, lsr #6
+-      and             r2, r2, #0x3f
+-neon_f2b_copy_64_u_loop:
+-      vld1.32         {q4, q5}, [r1]!
+-      vld1.32         {q6, q7}, [r1]!
+-      lsls            r5, r8, #28
+-      bcc             neon_f2b_copy_64_u_b8
+-      bpl             neon_f2b_copy_64_u_b16
+-      vshr.u64        q0, q4, #40
+-      vshr.u64        q1, q5, #40
+-      vshr.u64        q2, q6, #40
+-      vshr.u64        q3, q7, #40
+-      b               neon_f2b_copy_64_unify
+-neon_f2b_copy_64_u_b8:
+-      vshr.u64        q0, q4, #56
+-      vshr.u64        q1, q5, #56
+-      vshr.u64        q2, q6, #56
+-      vshr.u64        q3, q7, #56
+-      b               neon_f2b_copy_64_unify
+-neon_f2b_copy_64_u_b16:
+-      vshr.u64        q0, q4, #48
+-      vshr.u64        q1, q5, #48
+-      vshr.u64        q2, q6, #48
+-      vshr.u64        q3, q7, #48
+-neon_f2b_copy_64_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q4, q8, q4
+-      vshl.u64        q5, q8, q5
+-      vshl.u64        q6, q8, q6
+-      vshl.u64        q7, q8, q7
+-#else
+-      vshl.u64        q4, q4, q8
+-      vshl.u64        q5, q5, q8
+-      vshl.u64        q6, q6, q8
+-      vshl.u64        q7, q7, q8
+-#endif
+-      vmov            r5, s14
+-      vorr            d9, d9, d0
+-      vmov            s14, r4
+-      vorr            d10, d10, d1
+-      vorr            d11, d11, d2
+-      vorr            d12, d12, d3
+-      vorr            d13, d13, d4
+-      vorr            d14, d14, d5
+-      vorr            d15, d15, d6
+-      vorr            d8, d8, d7
+-      subs            r12, r12, #1
+-      pld             [r1, #0]
+-      pld             [r1, #128]
+-      mov             r4, r5
+-      vst1.32         {q4, q5}, [r0]!
+-      vst1.32         {q6, q7}, [r0]!
+-      pld             [r0, #0]
+-      pld             [r0, #128]
+-      bne             neon_f2b_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q5-q8}
+-      vldmia          sp!, {q4}
+-#else
+-      vpop            {q5-q8}
+-      vpop            {q4}
+-#endif
+-      cmp             r2, #32
+-      bge             neon_f2b_copy_32_u
+-      b               neon_f2b_copy_finish_u
+-neon_f2b_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4}
+-#else
+-      vpush           {q4}
+-#endif
+-      vdup.u32        q4, r8
+-      mov             r12, r2, lsr #5
+-      and             r2, r2, #0x1f
+-neon_f2b_copy_32_u_loop:
+-      vld1.32         {q0, q1}, [r1]!
+-      lsls            r5, r8, #28
+-      bcc             neon_f2b_copy_32_u_b8
+-      bpl             neon_f2b_copy_32_u_b16
+-      vshr.u64        q2, q0, #40
+-      vshr.u64        q3, q1, #40
+-      b               neon_f2b_copy_32_unify
+-neon_f2b_copy_32_u_b8:
+-      vshr.u64        q2, q0, #56
+-      vshr.u64        q3, q1, #56
+-      b               neon_f2b_copy_32_unify
+-neon_f2b_copy_32_u_b16:
+-      vshr.u64        q2, q0, #48
+-      vshr.u64        q3, q1, #48
+-neon_f2b_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q0, q4, q0
+-      vshl.u64        q1, q4, q1
+-#else
+-      vshl.u64        q0, q0, q4
+-      vshl.u64        q1, q1, q4
+-#endif
+-      vmov            r5, s14
+-      vorr            d1, d1, d4
+-      vmov            s14, r4
+-      vorr            d2, d2, d5
+-      vorr            d3, d3, d6
+-      vorr            d0, d0, d7
+-      subs            r12, r12, #1
+-      pld             [r1, #0]
+-      mov             r4, r5
+-      vst1.32         {q0, q1}, [r0]!
+-      bne             neon_f2b_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4}
+-#else
+-      vpop            {q4}
+-#endif
+-neon_f2b_copy_finish_u:
+-neon_f2b_copy_16_u:
+-      movs            r12, r2, lsr #4
+-      beq             neon_f2b_copy_8_u
+-      vdup.u32        q2, r8
+-      and             r2, r2, #0xf
+-neon_f2b_copy_16_u_loop:
+-      vld1.32         {q0}, [r1]!
+-      lsls            r5, r8, #28
+-      bcc             neon_f2b_copy_16_u_b8
+-      bpl             neon_f2b_copy_16_u_b16
+-      vshr.u64        q1, q0, #40
+-      b               neon_f2b_copy_16_unify
+-neon_f2b_copy_16_u_b8:
+-      vshr.u64        q1, q0, #56
+-      b               neon_f2b_copy_16_unify
+-neon_f2b_copy_16_u_b16:
+-      vshr.u64        q1, q0, #48
+-neon_f2b_copy_16_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q0, q2, q0
+-#else
+-      vshl.u64        q0, q0, q2
+-#endif
+-      vmov            r5, s6
+-      vorr            d1, d1, d2
+-      vmov            s6, r4
+-      vorr            d0, d0, d3
+-      subs            r12, r12, #1
+-      mov             r4, r5
+-      vst1.32         {q0}, [r0]!
+-      bne             neon_f2b_copy_16_u_loop
+-neon_f2b_copy_8_u:
+-      cmp             r2, #8
+-      blt             neon_f2b_copy_4_u
+-      ldm             r1!, {r6-r7}
+-      subs            r2, r2, #8
+-      orr             r4, r4, r6, lsl r8
+-      mov             r5, r6, lsr r9
+-      orr             r5, r5, r7, lsl r8
+-      stm             r0!, {r4-r5}
+-      mov             r4, r7, lsr r9
+-neon_f2b_copy_4_u:
+-      cmp             r2, #4
+-      blt             neon_f2b_last_bits_u
+-      ldr             r5, [r1], #4
+-      subs            r2, r2, #4
+-      orr             r4, r4, r5, lsl r8
+-      str             r4, [r0], #4
+-      mov             r4, r5, lsr r9
+-neon_f2b_last_bits_u:
+-      lsr             r8, r8, #0x3
+-neon_f2b_last_bits_u_loop:
+-      strb            r4, [r0], #1
+-      subs            r8, r8, #1
+-      lsr             r4, r4, #8
+-      bne             neon_f2b_last_bits_u_loop
+-neon_f2b_copy_1_u:
+-      cmp             r2, #0
+-      beq             neon_f2b_finish_u
+-neon_f2b_copy_1_u_loop:
+-      ldrb            r12, [r1], #1
+-      subs            r2, r2, #1
+-      strb            r12, [r0], #1
+-      bne             neon_f2b_copy_1_u_loop
+-neon_f2b_finish_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r7-r9}
+-#else
+-      pop             {r7-r9}
+-#endif
+-      /* #############################################################
+-       * Front to Back copy - finish
+-       */
+-neon_f2b_finish:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r4-r6}
+-#else
+-      pop             {r4-r6}
+-#endif
+-      b               neon_memmove_done
+-
+-      /* #############################################################
+-       * Back to Front copy
+-       */
+-neon_back_to_front_copy:
+-      /*
+-       * Here, we'll want to shift to the end of the buffers.  This
+-       * actually points us one past where we need to go, but since
+-       * we'll pre-decrement throughout, this will be fine.
+-       */
+-      add             r0, r0, r2
+-      add             r1, r1, r2
+-      cmp             r2, #4
+-      bgt             neon_b2f_gt4
+-      cmp             r2, #0
+-neon_b2f_smallcopy_loop:
+-      beq             neon_memmove_done
+-      ldrb            r12, [r1, #-1]!
+-      subs            r2, r2, #1
+-      strb            r12, [r0, #-1]!
+-      b               neon_b2f_smallcopy_loop
+-neon_b2f_gt4:
+-      pld             [r0, #0]
+-      pld             [r1, #0]
+-      /*
+-       * The minimum of the overlap window size and the copy size
+-       * is in r3.
+-       */
+-      sub             r3, r0, r1
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r4-r5}
+-#else
+-      push            {r4-r5}
+-#endif
+-
+-      /*
+-       * Check alignment.  Since we'll pre-decrement as we step thru, we'll
+-       * need to make sure we're on word-alignment.
+-       */
+-neon_b2f_check_align:
+-      ands            r12, r0, #0x3
+-      beq             neon_b2f_source_align_check
+-      sub             r2, r2, r12
+-neon_b2f_shift_align:
+-      ldrb            r4, [r1, #-1]!
+-      subs            r12, r12, #1
+-      strb            r4, [r0, #-1]!
+-      bne             neon_b2f_shift_align
+-neon_b2f_source_align_check:
+-      ands            r4, r1, #0x3
+-      bne             neon_b2f_nonaligned
+-      
+-neon_b2f_try_16_align:
+-      /* If we're >64, attempt to align on 16-bytes.  Smaller amounts
+-       * don't seem to be worth handling. */
+-      cmp             r2, #64
+-      blt             neon_b2f_align_route
+-      ands            r12, r0, #0xf
+-      beq             neon_b2f_align_route
+-      /* In this case, r12 has the number of bytes to roll backward. */
+-neon_b2f_16_start:
+-      sub             r2, r2, r12
+-      lsrs            r5, r12, #2
+-neon_b2f_align_16_4:
+-      ldr             r4, [r1, #-4]!
+-      subs            r5, r5, #1
+-      str             r4, [r0, #-4]!
+-      bne             neon_b2f_align_16_4
+-neon_b2f_align_route:
+-      /*
+-       * #############################################################
+-       * Back to Front copy - aligned
+-       */
+-      cmp             r2, r3
+-      movle           r12, r2
+-      movgt           r12, r3
+-      cmp             r12, #256
+-      bge             neon_b2f_copy_128_a
+-      cmp             r12, #64
+-      bge             neon_b2f_copy_32_a
+-      cmp             r12, #8
+-      bge             neon_b2f_copy_8_a
+-      cmp             r12, #4
+-      bge             neon_b2f_copy_4_a
+-      b               neon_b2f_copy_1_a
+-neon_b2f_copy_128_a:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4-q7}
+-#else
+-      vpush           {q4-q7}
+-#endif
+-      movs            r12, r2, lsr #7
+-      /*
+-       * This irks me.  There MUST be a better way to read these in and
+-       * scan the register backward instead of making it go forward.  Then
+-       * we need to do two subtractions...
+-       */
+-neon_b2f_copy_128_a_loop:
+-      sub             r1, r1, #128
+-      sub             r0, r0, #128
+-      vld1.32         {q0, q1}, [r1]!
+-      vld1.32         {q2, q3}, [r1]!
+-      vld1.32         {q4, q5}, [r1]!
+-      vld1.32         {q6, q7}, [r1]!
+-      pld             [r1, #-128]
+-      pld             [r1, #-256]
+-      vst1.32         {q0, q1}, [r0]!
+-      vst1.32         {q2, q3}, [r0]!
+-      vst1.32         {q4, q5}, [r0]!
+-      vst1.32         {q6, q7}, [r0]!
+-      subs            r12, r12, #1
+-      pld             [r0, #-128]
+-      pld             [r0, #-256]
+-      sub             r1, r1, #128
+-      sub             r0, r0, #128
+-      bne             neon_b2f_copy_128_a_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4-q7}
+-#else
+-      vpop            {q4-q7}
+-#endif
+-      ands            r2, r2, #0x7f
+-      beq             neon_b2f_finish
+-      cmp             r2, #32
+-      bge             neon_b2f_copy_32_a
+-      b               neon_b2f_copy_finish_a
+-neon_b2f_copy_32_a:
+-      mov             r12, r2, lsr #5
+-neon_b2f_copy_32_a_loop:
+-      sub             r1, r1, #32
+-      sub             r0, r0, #32
+-      vld1.32         {q0,q1}, [r1]
+-      subs            r12, r12, #1
+-      vst1.32         {q0,q1}, [r0]
+-      pld             [r1, #0]
+-      bne             neon_b2f_copy_32_a_loop
+-      ands            r2, r2, #0x1f
+-      beq             neon_b2f_finish
+-neon_b2f_copy_finish_a:
+-neon_b2f_copy_8_a:
+-      movs            r12, r2, lsr #0x3
+-      beq             neon_b2f_copy_4_a
+-neon_b2f_copy_8_a_loop:
+-      ldmdb           r1!, {r4-r5}
+-      subs            r12, r12, #1
+-      stmdb           r0!, {r4-r5}
+-      bne             neon_b2f_copy_8_a_loop
+-      and             r2, r2, #0x7
+-neon_b2f_copy_4_a:
+-      movs            r12, r2, lsr #0x2
+-      beq             neon_b2f_copy_1_a
+-      and             r2, r2, #0x3
+-neon_b2f_copy_4_a_loop:
+-      ldr             r4, [r1, #-4]!
+-      subs            r12, r12, #1
+-      str             r4, [r0, #-4]!
+-      bne             neon_b2f_copy_4_a_loop
+-neon_b2f_copy_1_a:
+-      cmp             r2, #0
+-      beq             neon_b2f_finish
+-neon_b2f_copy_1_a_loop:
+-      ldrb            r12, [r1, #-1]!
+-      subs            r2, r2, #1
+-      strb            r12, [r0, #-1]!
+-      bne             neon_b2f_copy_1_a_loop
+-
+-      /* #############################################################
+-       * Back to Front copy - unaligned
+-       */
+-neon_b2f_nonaligned:
+-      /*
+-       * For sizes < 8, does it really make sense to do the whole shift
+-       * party?
+-       */
+-      cmp             r2, #8
+-      ble             neon_b2f_copy_1_a
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      stmdb           sp!, {r6-r11}
+-#else
+-      push            {r6-r11}
+-#endif
+-      /*
+-       * r3 = max window size
+-       * r4 = overflow bytes
+-       * r5 = bytes we're reading into
+-       * r6 = # bytes we're off.
+-       * r10 = copy of r6
+-       */
+-      and             r6, r1, #0x3
+-      eor             r4, r4, r4
+-      mov             r10, r6
+-neon_b2f_realign:
+-      ldrb            r5, [r1, #-1]!
+-      subs            r6, r6, #1
+-      orr             r4, r5, r4, lsl #8
+-      bne             neon_b2f_realign
+-      /*
+-       * r10 = # of bits we copied into the r4 register to align source.
+-       * r11 = 32 - r10
+-       * r12 = Index counter for each size, so we determine how many times
+-       *       the given size will go into r2, then count down that # of
+-       *       times in r12.
+-       */
+-      sub             r2, r2, r10
+-      lsl             r10, r10, #0x3
+-      rsb             r11, r10, #32
+-
+-      cmp             r2, r3
+-      movle           r12, r2
+-      movgt           r12, r3
+-      cmp             r12, #256
+-      bge             neon_b2f_copy_64_u
+-      cmp             r12, #64
+-      bge             neon_b2f_copy_32_u
+-      cmp             r12, #8
+-      bge             neon_b2f_copy_8_u
+-      cmp             r12, #4
+-      bge             neon_b2f_copy_4_u
+-      b               neon_b2f_last_bits_u
+-neon_b2f_copy_64_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4,q5}
+-      vstmdb          sp!, {q6-q8}
+-#else
+-      vpush           {q4,q5}
+-      vpush           {q6-q8}
+-#endif
+-      add             r7, r11, #32
+-      movs            r12, r2, lsr #6
+-      vdup.u32        q8, r7
+-neon_b2f_copy_64_u_loop:
+-      sub             r1, r1, #64
+-      sub             r0, r0, #64
+-      vld1.32         {q0, q1}, [r1]!
+-      vld1.32         {q2, q3}, [r1]
+-      sub             r1, r1, #32
+-      vmov            q4, q0
+-      vmov            q5, q1
+-      vmov            q6, q2
+-      vmov            q7, q3
+-      vmov            r5, s0
+-      mov             r4, r4, lsl r11
+-      lsls            r6, r10, #28
+-      bcc             neon_b2f_copy_64_u_b8
+-      bpl             neon_b2f_copy_64_u_b16
+-      vshr.u64        q0, q0, #24
+-      vshr.u64        q1, q1, #24
+-      vshr.u64        q2, q2, #24
+-      vshr.u64        q3, q3, #24
+-      b               neon_b2f_copy_64_unify
+-neon_b2f_copy_64_u_b8:
+-      vshr.u64        q0, q0, #8
+-      vshr.u64        q1, q1, #8
+-      vshr.u64        q2, q2, #8
+-      vshr.u64        q3, q3, #8
+-      b               neon_b2f_copy_64_unify
+-neon_b2f_copy_64_u_b16:
+-      vshr.u64        q0, q0, #16
+-      vshr.u64        q1, q1, #16
+-      vshr.u64        q2, q2, #16
+-      vshr.u64        q3, q3, #16
+-neon_b2f_copy_64_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q4, q8, q4
+-      vshl.u64        q5, q8, q5
+-      vshl.u64        q6, q8, q6
+-      vshl.u64        q7, q8, q7
+-#else
+-      vshl.u64        q4, q4, q8
+-      vshl.u64        q5, q5, q8
+-      vshl.u64        q6, q6, q8
+-      vshl.u64        q7, q7, q8
+-#endif
+-      vmov            s17, r4
+-      vorr            d7, d7, d8
+-      vorr            d6, d6, d15
+-      vorr            d5, d5, d14
+-      vorr            d4, d4, d13
+-      vorr            d3, d3, d12
+-      vorr            d2, d2, d11
+-      vorr            d1, d1, d10
+-      vorr            d0, d0, d9
+-      mov             r4, r5, lsl r11
+-      subs            r12, r12, #1
+-      lsr             r4, r4, r11
+-      vst1.32         {q0, q1}, [r0]!
+-      vst1.32         {q2, q3}, [r0]
+-      pld             [r1, #0]
+-      sub             r0, r0, #32
+-      bne             neon_b2f_copy_64_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q6-q8}
+-      vldmia          sp!, {q4,q5}
+-#else
+-      vpop            {q6-q8}
+-      vpop            {q4,q5}
+-#endif
+-      ands            r2, r2, #0x3f
+-      cmp             r2, #32
+-      bge             neon_b2f_copy_32_u
+-      b               neon_b2f_copy_finish_u
+-neon_b2f_copy_32_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vstmdb          sp!, {q4}
+-#else
+-      vpush           {q4}
+-#endif
+-      add             r7, r11, #32
+-      movs            r12, r2, lsr #5
+-      vdup.u32        q4, r7
+-      and             r2, r2, #0x1f
+-neon_b2f_copy_32_u_loop:
+-      sub             r1, r1, #32
+-      sub             r0, r0, #32
+-      vld1.32         {q0, q1}, [r1]
+-      vmov            q2, q0
+-      vmov            q3, q1
+-      vmov            r5, s0
+-      mov             r4, r4, lsl r11
+-      lsls            r6, r10, #28
+-      bcc             neon_b2f_copy_32_u_b8
+-      bpl             neon_b2f_copy_32_u_b16
+-      vshr.u64        q0, q0, #24
+-      vshr.u64        q1, q1, #24
+-      b               neon_b2f_copy_32_unify
+-neon_b2f_copy_32_u_b8:
+-      vshr.u64        q0, q0, #8
+-      vshr.u64        q1, q1, #8
+-      b               neon_b2f_copy_32_unify
+-neon_b2f_copy_32_u_b16:
+-      vshr.u64        q0, q0, #16
+-      vshr.u64        q1, q1, #16
+-neon_b2f_copy_32_unify:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vshl.u64        q2, q4, q2
+-      vshl.u64        q3, q4, q3
+-#else
+-      vshl.u64        q2, q2, q4
+-      vshl.u64        q3, q3, q4
+-#endif
+-      vmov            s9, r4
+-      vorr            d3, d3, d4
+-      vorr            d2, d2, d7
+-      vorr            d1, d1, d6
+-      vorr            d0, d0, d5
+-      mov             r4, r5, lsl r11
+-      subs            r12, r12, #1
+-      lsr             r4, r4, r11
+-      vst1.32         {q0, q1}, [r0]
+-      pld             [r1, #0]
+-      bne             neon_b2f_copy_32_u_loop
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      vldmia          sp!, {q4}
+-#else
+-      vpop            {q4}
+-#endif
+-neon_b2f_copy_finish_u:
+-neon_b2f_copy_8_u:
+-      movs            r12, r2, lsr #0x3
+-      beq             neon_b2f_copy_4_u
+-      mov             r5, r4, lsl r11
+-neon_b2f_copy_8_u_loop:
+-      ldmdb           r1!, {r6-r7}
+-      subs            r12, r12, #1
+-      orr             r5, r5, r7, lsr r10
+-      mov             r4, r7, lsl r11
+-      orr             r4, r4, r6, lsr r10
+-      stmdb           r0!, {r4-r5}
+-      mov             r4, r6, lsl r11
+-      lsr             r4, r4, r11
+-      mov             r5, r4, lsl r11
+-      bne             neon_b2f_copy_8_u_loop
+-      ands            r2, r2, #0x7
+-neon_b2f_copy_4_u:
+-      movs            r12, r2, lsr #0x2
+-      beq             neon_b2f_last_bits_u
+-      mov             r5, r4, lsl r11
+-neon_b2f_copy_4_u_loop:
+-      ldr             r6, [r1, #-4]!
+-      subs            r12, r12, #1
+-      orr             r5, r5, r6, lsr r10
+-      str             r5, [r0, #-4]!
+-      mov             r4, r6, lsl r11
+-      lsr             r4, r4, r11
+-      mov             r5, r4, lsl r11
+-      bne             neon_b2f_copy_4_u_loop
+-      and             r2, r2, #0x3
+-neon_b2f_last_bits_u:
+-neon_b2f_last_bits_u_loop:
+-      subs            r10, r10, #8
+-      mov             r5, r4, lsr r10
+-      strb            r5, [r0, #-1]!
+-      bne             neon_b2f_last_bits_u_loop
+-neon_b2f_copy_1_u:
+-      cmp             r2, #0
+-      beq             neon_b2f_finish_u
+-neon_b2f_copy_1_u_loop:
+-      ldrb            r12, [r1, #-1]!
+-      subs            r2, r2, #1
+-      strb            r12, [r0, #-1]!
+-      bne             neon_b2f_copy_1_u_loop
+-neon_b2f_finish_u:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r6-r11}
+-#else
+-      pop             {r6-r11}
+-#endif
+-
+-neon_b2f_finish:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r4-r5}
+-#else
+-      pop             {r4-r5}
+-#endif
+-
+-neon_memmove_done:
+-#if defined __GNUC__ && (4 == __GNUC__ && 1 == __GNUC_MINOR__ && 1 == __GNUC_PATCHLEVEL__)
+-      ldmia           sp!, {r0}
+-#else
+-      pop             {r0}
+-#endif
+-      bx              lr
+-
+-      .endfunc
+-      .end
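The deleted neon_memmove picks its copy direction from the relative position of the two buffers, so overlapping regions are never clobbered before they are read. A minimal plain-C sketch of that decision (illustration only, not the driver's code):

    #include <stdint.h>
    #include <stddef.h>

    /* dst below src: front-to-back never overwrites unread source bytes;
     * dst above src: back-to-front is the safe direction. */
    static void *memmove_sketch(void *dest, const void *source, size_t n)
    {
        uint8_t *d = dest;
        const uint8_t *s = source;

        if (d == s || n == 0)
            return dest;
        if (d < s) {                     /* front-to-back copy */
            while (n--)
                *d++ = *s++;
        } else {                         /* back-to-front copy */
            d += n;
            s += n;
            while (n--)
                *--d = *--s;
        }
        return dest;
    }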
+diff --git git/src/neon_memsets.c git/src/neon_memsets.c
+deleted file mode 100755
+index 740fc1e..0000000
+--- git/src/neon_memsets.c
++++ /dev/null
+@@ -1,169 +0,0 @@
+-/* neon_memsets.c
+- *
+- * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+- *
+- * Redistribution and use in source and binary forms, with or without
+- * modification, are permitted provided that the following conditions are met:
+- *     * Redistributions of source code must retain the above copyright
+- *       notice, this list of conditions and the following disclaimer.
+- *     * Redistributions in binary form must reproduce the above copyright
+- *       notice, this list of conditions and the following disclaimer in the
+- *       documentation and/or other materials provided with the distribution.
+- *     * Neither the name of Code Aurora nor
+- *       the names of its contributors may be used to endorse or promote
+- *       products derived from this software without specific prior written
+- *       permission.
+- *
+- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+- * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+- * NON-INFRINGEMENT ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- */
+-
+-#include "msm-swblits.h"
+-
+-void memset16(uint16_t dst[], uint16_t value, int count)
+-{
+-    if (count <= 0)
+-        return;
+-
+-    asm volatile(
+-                 "       pld        [%[dst], #0]                           \n"
+-                 "       cmp        %[count], #4                           \n"
+-                 "       blt        6f                                     \n"
+-                 "       tst        %[dst], #0x3                           \n"
+-                 "       strneh     %[value], [%[dst]], #2                 \n"
+-                 "       subne      %[count], %[count], #1                 \n"
+-                 "       vdup.u16   q8, %[value]                           \n"
+-                 "       vmov       q9, q8                                 \n"
+-                 "       cmp        %[count], #64                          \n"
+-                 "       bge        0f                                     \n"
+-                 "       cmp        %[count], #32                          \n"
+-                 "       bge        2f                                     \n"
+-                 "       cmp        %[count], #16                          \n"
+-                 "       bge        3f                                     \n"
+-                 "       cmp        %[count], #8                           \n"
+-                 "       bge        4f                                     \n"
+-                 "       b          5f                                     \n"
+-                 "0:                                                       \n"
+-                 "       mov        r12, %[count], lsr #6                  \n"
+-                 "1:                                                       \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       subs       r12, r12, #1                           \n"
+-                 "       bne        1b                                     \n"
+-                 "       ands       %[count], %[count], #0x3f              \n"
+-                 "       beq        7f                                     \n"
+-                 "2:                                                       \n"
+-                 "       cmp        %[count], #32                          \n"
+-                 "       blt        3f                                     \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       subs       %[count], %[count], #32                \n"
+-                 "       beq        7f                                     \n"
+-                 "3:                                                       \n"
+-                 "       cmp        %[count], #16                          \n"
+-                 "       blt        4f                                     \n"
+-                 "       vst1.16    {q8, q9}, [%[dst]]!                    \n"
+-                 "       subs       %[count], %[count], #16                \n"
+-                 "       beq        7f                                     \n"
+-                 "4:                                                       \n"
+-                 "       cmp        %[count], #8                           \n"
+-                 "       blt        5f                                     \n"
+-                 "       vst1.16    {q8}, [%[dst]]!                        \n"
+-                 "       subs       %[count], %[count], #8                 \n"
+-                 "       beq        7f                                     \n"
+-                 "5:                                                       \n"
+-                 "       cmp        %[count], #4                           \n"
+-                 "       blt        6f                                     \n"
+-                 "       vst1.16    {d16}, [%[dst]]!                       \n"
+-                 "       subs       %[count], %[count], #4                 \n"
+-                 "       beq        7f                                     \n"
+-                 "6:                                                       \n"
+-                 "       cmp        %[count], #0                           \n"
+-                 "       blt        7f                                     \n"
+-                 "       lsls       %[count], #31                          \n"
+-                 "       strmih     %[value], [%[dst]], #2                 \n"
+-                 "       strcsh     %[value], [%[dst]], #2                 \n"
+-                 "       strcsh     %[value], [%[dst]], #2                 \n"
+-                 "7:                                                       \n"
+-                 // Clobbered input registers
+-                 : [dst] "+r" (dst), [count] "+r" (count)
+-                 // Unclobbered input
+-                 : [value] "r" (value)
+-                 // Clobbered registers
+-                 : "q8", "q9", "r12", "cc", "memory"
+-                 );
+-}
+-
+-void memset32(uint32_t dst[], uint32_t value, int count)
+-{
+-    asm volatile(
+-                 "       pld        [%[dst], #0]                           \n"
+-                 "       cmp        %[count], #4                           \n"
+-                 "       blt        5f                                     \n"
+-                 "       vdup.u32   q8, %[value]                           \n"
+-                 "       vmov       q9, q8                                 \n"
+-                 "       cmp        %[count], #32                          \n"
+-                 "       bge        0f                                     \n"
+-                 "       cmp        %[count], #16                          \n"
+-                 "       bge        2f                                     \n"
+-                 "       cmp        %[count], #8                           \n"
+-                 "       bge        3f                                     \n"
+-                 "       b          4f                                     \n"
+-                 "0:                                                       \n"
+-                 "       mov        r12, %[count], lsr #5                  \n"
+-                 "1:                                                       \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       pld        [%[dst], #0]                           \n"
+-                 "       subs       r12, r12, #1                           \n"
+-                 "       bne        1b                                     \n"
+-                 "       ands       %[count], %[count], #0x1f              \n"
+-                 "       beq        6f                                     \n"
+-                 "2:                                                       \n"
+-                 "       cmp        %[count], #16                          \n"
+-                 "       blt        3f                                     \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       subs       %[count], %[count], #16                \n"
+-                 "       beq        6f                                     \n"
+-                 "3:                                                       \n"
+-                 "       cmp        %[count], #8                           \n"
+-                 "       blt        4f                                     \n"
+-                 "       vst1.32    {q8, q9}, [%[dst]]!                    \n"
+-                 "       subs       %[count], %[count], #8                 \n"
+-                 "       beq        6f                                     \n"
+-                 "4:                                                       \n"
+-                 "       cmp        %[count], #4                           \n"
+-                 "       blt        5f                                     \n"
+-                 "       vst1.32    {q8}, [%[dst]]!                        \n"
+-                 "       subs       %[count], %[count], #4                 \n"
+-                 "       beq        6f                                     \n"
+-                 "5:                                                       \n"
+-                 "       cmp        %[count], #0                           \n"
+-                 "       beq        6f                                     \n"
+-                 "       lsls       %[count], #31                          \n"
+-                 "       strmi      %[value], [%[dst]], #4                 \n"
+-                 "       strcs      %[value], [%[dst]], #4                 \n"
+-                 "       strcs      %[value], [%[dst]], #4                 \n"
+-                 "6: @end                                                  \n"
+-                 // Clobbered input registers
+-                 : [dst] "+r" (dst), [count] "+r" (count)
+-                 // Unclobbered input
+-                 : [value] "r" (value)
+-                 // Clobbered registers
+-                 : "q8", "q9", "r12", "cc", "memory"
+-                 );
+-}
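The deleted memset16()/memset32() fill a buffer with a complete 16- or 32-bit value per element rather than repeating a single byte. Minimal plain-C equivalents, shown only to document the contract of the removed functions (no NEON batching):

    #include <stdint.h>

    static void memset16_sketch(uint16_t *dst, uint16_t value, int count)
    {
        while (count-- > 0)              /* one 16-bit store per element */
            *dst++ = value;
    }

    static void memset32_sketch(uint32_t *dst, uint32_t value, int count)
    {
        while (count-- > 0)              /* one 32-bit store per element */
            *dst++ = value;
    }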
diff --git a/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch b/recipes/xorg-driver/xf86-video-msm/no_neon_flags.patch
new file mode 100644 (file)
index 0000000..97ad380
--- /dev/null
@@ -0,0 +1,36 @@
+commit 18515a56822fcd9c0a71240edce97ea5623b0448
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date:   Wed Feb 10 16:29:55 2010 +0100
+
+    Modify Makefile.am
+    Removed dependencies on neon
+
+diff --git git/src/Makefile.am git/src/Makefile.am
+index 8ab1856..08da5a5 100755
+--- a/src/Makefile.am
++++ b/src/Makefile.am
+@@ -12,13 +12,7 @@ MSM_DRI_SRCS += msm-drm.c msm-dri2.c
+ msm_drv_la_LIBADD += $(DRI2_LIBS)
+ endif
+-NEON_CFLAGS=-march=armv7-a -mfpu=neon -mfloat-abi=softfp
+-NEON_CCASFLAGS=$(NEON_CFLAGS) -mthumb-interwork
+-NEON_ASFLAGS=-k -mcpu=cortex-a8 $(NEON_CCASFLAGS)
+-
+-AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ $(NEON_CFLAGS) -Wall -Werror
+-AM_ASFLAGS = $(NEON_ASFLAGS)
+-AM_CCASFLAGS = $(NEON_CCASFLAGS)
++AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @DRI2_CFLAGS@ -Wall -Werror
+ msm_drv_la_LTLIBRARIES = msm_drv.la
+ msm_drv_la_LDFLAGS = -module -avoid-version
+@@ -37,9 +31,6 @@ msm_drv_la_SOURCES = \
+        msm-swfill.c \
+        msm-hwrender.c \
+        msm-pixmap.c \
+-       neon_memsets.c \
+-       neon_memcpy.S \
+-       neon_memmove.S \
+       $(MSM_DRI_SRCS)
diff --git a/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch b/recipes/xorg-driver/xf86-video-msm/renaming_variables.patch
new file mode 100644 (file)
index 0000000..90dd31f
--- /dev/null
@@ -0,0 +1,116 @@
+commit cc83ba5835d5b55347fd0c0775156493b0cf3a15
+Author: David Lanzendörfer <david.lanzendoerfer@o2s.ch>
+Date:   Thu Feb 11 16:26:52 2010 +0100
+
+    Renaming variables to get Xorg (xf86-video-msm) to work
+    under linux-leviathan (htcdream):
+    cd src
+    sed 's/fixed_info/fix/' -i *.h
+    sed 's/fixed_info/fix/' -i *.c
+
+diff --git git/src/msm-dri.c git/src/msm-dri.c
+index a51d3bd..a74368b 100644
+--- git/src/msm-dri.c
++++ git/src/msm-dri.c
+@@ -151,10 +151,10 @@ MSMDRIScreenInit(ScreenPtr pScreen)
+    pDRIInfo->ddxDriverMinorVersion = 0;
+    pDRIInfo->ddxDriverPatchVersion = 0;
+-   pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fixed_info.smem_start;
++   pDRIInfo->frameBufferPhysicalAddress = (void *)pMsm->fix.smem_start;
+-   pDRIInfo->frameBufferSize = pMsm->fixed_info.smem_len;
+-   pDRIInfo->frameBufferStride = pMsm->fixed_info.line_length;
++   pDRIInfo->frameBufferSize = pMsm->fix.smem_len;
++   pDRIInfo->frameBufferStride = pMsm->fix.line_length;
+    /* FIXME: How many drawables can we do (should we do)? */
+diff --git git/src/msm-driver.c git/src/msm-driver.c
+index 803197f..15378f8 100755
+--- git/src/msm-driver.c
++++ git/src/msm-driver.c
+@@ -399,7 +399,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+    /* Get the fixed info (par) structure */
+-   if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fixed_info)) {
++   if (ioctl(pMsm->fd, FBIOGET_FSCREENINFO, &pMsm->fix)) {
+       xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+                "Unable to read hardware info from %s: %s\n",
+                dev, strerror(errno));
+@@ -410,7 +410,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+    /* Parse the ID and figure out what version of the MDP and what
+     * panel ID we have */
+-   if (sscanf(pMsm->fixed_info.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
++   if (sscanf(pMsm->fix.id, "msmfb%d_%x", &mdpver, &panelid) < 2) {
+       xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+                "Unable to determine the MDP and panel type\n");
+@@ -435,7 +435,7 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+     * the fbdev driver to allocate memory.   In the mean time, we
+     * just reuse the framebuffer memory */
+-   pScrn->videoRam = pMsm->fixed_info.smem_len;
++   pScrn->videoRam = pMsm->fix.smem_len;
+    /* Get the current screen setting */
+    if (ioctl(pMsm->fd, FBIOGET_VSCREENINFO, &pMsm->mode_info)) {
+@@ -671,8 +671,8 @@ MSMPreInit(ScrnInfoPtr pScrn, int flags)
+    /* The framebuffer driver should always report the line length,
+     * but in case it doesn't, we can calculate it ourselves */
+-   if (pMsm->fixed_info.line_length) {
+-      pScrn->displayWidth = pMsm->fixed_info.line_length;
++   if (pMsm->fix.line_length) {
++      pScrn->displayWidth = pMsm->fix.line_length;
+    } else {
+       pScrn->displayWidth = pMsm->mode_info.xres_virtual *
+           pMsm->mode_info.bits_per_pixel / 8;
+@@ -811,7 +811,7 @@ MSMCloseScreen(int scrnIndex, ScreenPtr pScreen)
+ #endif
+    /* Unmap the framebuffer memory */
+-   munmap(pMsm->fbmem, pMsm->fixed_info.smem_len);
++   munmap(pMsm->fbmem, pMsm->fix.smem_len);
+    pScreen->CloseScreen = pMsm->CloseScreen;
+@@ -857,7 +857,7 @@ MSMScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
+ #endif // defined (MSMFB_GET_PAGE_PROTECTION) && defined (MSMFB_SET_PAGE_PROTECTION)
+    /* Map the framebuffer memory */
+-   pMsm->fbmem = mmap(NULL, pMsm->fixed_info.smem_len,
++   pMsm->fbmem = mmap(NULL, pMsm->fix.smem_len,
+                     PROT_READ | PROT_WRITE, MAP_SHARED, pMsm->fd, 0);
+    /* If we can't map the memory, then this is a short trip */
+diff --git git/src/msm-exa.c git/src/msm-exa.c
+index 301923f..ce16a93 100755
+--- git/src/msm-exa.c
++++ git/src/msm-exa.c
+@@ -740,8 +740,8 @@ MSMSetupExa(ScreenPtr pScreen)
+    pExa->flags = EXA_OFFSCREEN_PIXMAPS;
+    pExa->offScreenBase =
+-       (pMsm->fixed_info.line_length * pMsm->mode_info.yres);
+-   pExa->memorySize = pMsm->fixed_info.smem_len;
++       (pMsm->fix.line_length * pMsm->mode_info.yres);
++   pExa->memorySize = pMsm->fix.smem_len;
+    /* Align pixmap offsets along page boundaries */
+    pExa->pixmapOffsetAlign = 4096;
+diff --git git/src/msm.h git/src/msm.h
+index e1e2bc7..520d390 100755
+--- git/src/msm.h
++++ git/src/msm.h
+@@ -85,7 +85,7 @@ typedef struct _MSMRec
+    int fd;
+    /* Fixed and var strutures from the framebuffer */
+-   struct fb_fix_screeninfo fixed_info;
++   struct fb_fix_screeninfo fix;
+    struct fb_var_screeninfo mode_info;
+    /* Pointer to the mapped framebuffer memory */
index faccb87..4723b86 100644 (file)
@@ -8,9 +8,15 @@ SRCREV = "5f7df59155ae301a3ebc40aec22ed16d203cb5fc"
 PV = "1.1.0+${PR}+gitr${SRCREV}"
 PE = "1"
 
-SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git\
-          "
+SRC_URI = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git"
+SRC_URI_htcdream = "git://codeaurora.org/quic/xwin/xf86-video-msm.git;protocol=git \
+       file://no_neon.patch;patch=1 \
+       file://no_neon_flags.patch;patch=1 \
+       file://renaming_variables.patch;patch=1"
 
 S = "${WORKDIR}/git"
 
 CFLAGS += " -I${STAGING_INCDIR}/xorg "
+CFLAGS += " -Wno-error "
+
+ARM_INSTRUCTION_SET="arm"