recipes/xorg-driver/xf86-video-omapfb/omapfb-neon.diff

   1 --- /tmp/image-format-conversions.h     2009-02-03 10:18:04.000000000 +0100
   2 +++ git/src/image-format-conversions.h  2009-02-03 10:19:18.000000000 +0100
   3 @@ -30,6 +30,8 @@
   4  /* Basic C implementation of YV12/I420 to UYVY conversion */
   5  void uv12_to_uyvy(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
   6
   7 +/* NEON implementation of YV12/I420 to UYVY conversion; takes the same argument list as uv12_to_uyvy() */
   8 +void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
   9
  10  #endif /* __IMAGE_FORMAT_CONVERSIONS_H__ */
  11
  12 --- /tmp/image-format-conversions.c     2009-02-03 10:18:04.000000000 +0100
  13 +++ git/src/image-format-conversions.c  2009-02-03 10:16:47.000000000 +0100
  14 @@ -2,6 +2,7 @@
  15   * Copyright 2008 Kalle Vahlman, <zuh@iki.fi>
  16   *                Ilpo Ruotsalainen, <lonewolf@iki.fi>
  17   *                Tuomas Kulve, <tuomas.kulve@movial.com>
  18 + *                Ian Rickards, <ian.rickards@arm.com>
  19   *
  20   *
  21   * Permission to use, copy, modify, distribute and sell this software and its
  22 @@ -89,3 +90,104 @@
  23         }
  24  }
  25
  26 +void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest)
  27 +{
  28 +    int x, y;
  29 +    uint8_t *dest_even = dest;
  30 +    uint8_t *dest_odd = dest + w * 2; /* output rows are w*2 bytes apart (2 bytes per pixel) */
  31 +    uint8_t *y_p_even = y_p;
  32 +    uint8_t *y_p_odd = y_p + y_pitch;
  33 +
  34 +    /* Packs planar Y, U and V into U-Y-V-Y byte order, two image rows per outer iteration. NOTE(review): both paths step rows and pixels in pairs, so w and h are presumably even - confirm with callers. */
  35 +    if (w<16) /* narrower than one 16-pixel NEON block: plain C path */
  36 +    {
  37 +        for (y=0; y<h; y+=2)
  38 +        {
  39 +            for (x=0; x<w; x+=2)
  40 +            {
  41 +                /* One U and one V sample are shared by a 2x2 pixel block: emit it as one U-Y-V-Y group on each of the two rows */
  42 +                uint8_t u_val = *u_p++;
  43 +                uint8_t v_val = *v_p++;
  44 +
  45 +                /* Even row, first pixel: U then Y */
  46 +                *dest_even++ = u_val;
  47 +                *dest_even++ = *y_p_even++;
  48 +
  49 +                /* Even row, second pixel: V then Y */
  50 +                *dest_even++ = v_val;
  51 +                *dest_even++ = *y_p_even++;
  52 +
  53 +                /* Odd row, first pixel: same U, next-row Y */
  54 +                *dest_odd++ = u_val;
  55 +                *dest_odd++ = *y_p_odd++;
  56 +
  57 +                /* Odd row, second pixel: same V, next-row Y */
  58 +                *dest_odd++ = v_val;
  59 +                *dest_odd++ = *y_p_odd++;
  60 +            }
  61 +
  62 +            dest_even += w * 2; /* skip the row just written through dest_odd */
  63 +            dest_odd += w * 2; /* skip the next row, which dest_even writes */
  64 +
  65 +            u_p += ((uv_pitch << 1) - w) >> 1; /* uv_pitch - w/2 for even w: start of the next chroma row */
  66 +            v_p += ((uv_pitch << 1) - w) >> 1;
  67 +
  68 +            y_p_even += (y_pitch - w) + y_pitch; /* finish this row, then skip one: luma rows advance in pairs */
  69 +            y_p_odd += (y_pitch - w) + y_pitch;
  70 +        }
  71 +    }
  72 +    else
  73 +    {
  74 +        for (y=0; y<h; y+=2)
  75 +        {
  76 +            x=w; /* pixels still to convert on this row pair */
  77 +            do {
  78 +                // Each pass converts 16 pixels of both rows using only q0-q2, so the AAPCS callee-saved d8-d15 (q4-q7) stay untouched
  79 +                asm volatile (
  80 +                        "1:\n\t"
  81 +                        "vld1.u8   {d0}, [%[u_p]]!\n\t" // d0 = 8 U bytes
  82 +                        "sub       %[x],%[x],#16\n\t"
  83 +                        "cmp       %[x],#16\n\t" // flags are consumed by the bhs at the end of the pass
  84 +                        "vld1.u8   {d1}, [%[v_p]]!\n\t" // d1 = 8 V bytes
  85 +                        "vld1.u8   {q1}, [%[y_p_even]]!\n\t" // q1 = 16 Y bytes of the even row
  86 +                        "vzip.u8   d0, d1\n\t" // q0 = u0 v0 u1 v1 ... u7 v7
  87 +                        "vld1.u8   {q2}, [%[y_p_odd]]!\n\t" // q2 = 16 Y bytes of the odd row
  88 +                // 2-element structure stores interleave the zipped u/v bytes (q0) with the y bytes (q1)
  89 +                        "vst2.u8   {q0,q1}, [%[dest_even]]!\n\t"
  90 +                        "vmov.u8   q1, q2\n\t" // odd-row Y into q1 so the same {q0,q1} store form is reused
  91 +                        "vst2.u8   {q0,q1}, [%[dest_odd]]!\n\t"
  92 +                        "bhs       1b\n\t" // repeat while at least 16 pixels were left at the cmp
  93 +                        : [u_p] "+r" (u_p), [v_p] "+r" (v_p), [y_p_even] "+r" (y_p_even), [y_p_odd] "+r" (y_p_odd),
  94 +                          [dest_even] "+r" (dest_even), [dest_odd] "+r" (dest_odd),
  95 +                          [x] "+r" (x)
  96 +                        :
  97 +                        : "cc", "memory", "d0","d1","d2","d3","d4","d5"
  98 +                        );
  99 +                if (x!=0)
 100 +                {
 101 +                    // x pixels remain (0 < x < 16): step every pointer back so one more 16-pixel block ends exactly at column w
 102 +                    x = 16-x; /* x is now the number of pixels to step back */
 103 +                    u_p -= x/2; /* one U and one V byte per two pixels */
 104 +                    v_p -= x/2;
 105 +                    y_p_even -= x;
 106 +                    y_p_odd -= x;
 107 +                    dest_even -= x*2; /* two output bytes per pixel */
 108 +                    dest_odd -= x*2;
 109 +                    x = 16;
 110 +                    // the enclosing do/while re-enters the asm once more for this final, overlapping block
 111 +                }
 112 +            }
 113 +            while (x!=0);
 114 +
 115 +            dest_even += w * 2; /* skip the row just written through dest_odd */
 116 +            dest_odd += w * 2; /* skip the next row, which dest_even writes */
 117 +
 118 +            u_p += ((uv_pitch << 1) - w) >> 1; /* uv_pitch - w/2 for even w: start of the next chroma row */
 119 +            v_p += ((uv_pitch << 1) - w) >> 1;
 120 +
 121 +            y_p_even += (y_pitch - w) + y_pitch; /* finish this row, then skip one: luma rows advance in pairs */
 122 +            y_p_odd += (y_pitch - w) + y_pitch;
 123 +        }
 124 +    }
 125 +}
 126 +
 127 --- /tmp/omapfb-xv-generic.c    2009-02-03 10:52:18.000000000 +0100
 128 +++ git/src/omapfb-xv-generic.c 2009-02-03 10:52:24.000000000 +0100
 129 @@ -240,7 +240,7 @@
 130                         uint8_t *yb = buf;
 131                         uint8_t *ub = yb + (src_y_pitch * src_h);
 132                         uint8_t *vb = ub + (src_uv_pitch * (src_h / 2));
 133 -                       uv12_to_uyvy(src_w & ~15,
 134 +                       uv12_to_uyvy_neon(src_w & ~15,
 135                                      src_h & ~15,
 136                                      src_y_pitch,
 137                                      src_uv_pitch,
 138 @@ -256,7 +256,7 @@
 139                         uint8_t *yb = buf;
 140                         uint8_t *vb = yb + (src_y_pitch * src_h);
 141                         uint8_t *ub = vb + (src_uv_pitch * (src_h / 2));
 142 -                       uv12_to_uyvy(src_w & ~15,
 143 +                       uv12_to_uyvy_neon(src_w & ~15,
 144                                      src_h & ~15,
 145                                      src_y_pitch,
 146                                      src_uv_pitch,