1 --- /tmp/image-format-conversions.h 2009-02-03 10:18:04.000000000 +0100
2 +++ git/src/image-format-conversions.h 2009-02-03 10:19:18.000000000 +0100
4 /* Basic C implementation of YV12/I420 to UYVY conversion */
5 void uv12_to_uyvy(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
7 +/* NEON implementation of YV12/I420 to UYVY conversion */
8 +void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest);
10 #endif /* __IMAGE_FORMAT_CONVERSIONS_H__ */
12 --- /tmp/image-format-conversions.c 2009-02-03 10:18:04.000000000 +0100
13 +++ git/src/image-format-conversions.c 2009-02-03 10:16:47.000000000 +0100
15 * Copyright 2008 Kalle Vahlman, <zuh@iki.fi>
16 * Ilpo Ruotsalainen, <lonewolf@iki.fi>
17 * Tuomas Kulve, <tuomas.kulve@movial.com>
18 + * Ian Rickards, <ian.rickards@arm.com>
21 * Permission to use, copy, modify, distribute and sell this software and its
26 +void uv12_to_uyvy_neon(int w, int h, int y_pitch, int uv_pitch, uint8_t *y_p, uint8_t *u_p, uint8_t *v_p, uint8_t *dest)
29 + uint8_t *dest_even = dest;
30 + uint8_t *dest_odd = dest + w * 2;
31 + uint8_t *y_p_even = y_p;
32 + uint8_t *y_p_odd = y_p + y_pitch;
34 + /*ErrorF("in uv12_to_uyvy, w: %d, pitch: %d\n", w, pitch);*/
37 + for (y=0; y<h; y+=2)
39 + for (x=0; x<w; x+=2)
41 + /* Output two 2x1 macroblocks to form a 2x2 block from input */
42 + uint8_t u_val = *u_p++;
43 + uint8_t v_val = *v_p++;
45 + /* Even row, first pixel */
46 + *dest_even++ = u_val;
47 + *dest_even++ = *y_p_even++;
49 + /* Even row, second pixel */
50 + *dest_even++ = v_val;
51 + *dest_even++ = *y_p_even++;
53 + /* Odd row, first pixel */
54 + *dest_odd++ = u_val;
55 + *dest_odd++ = *y_p_odd++;
57 + /* Odd row, second pixel */
58 + *dest_odd++ = v_val;
59 + *dest_odd++ = *y_p_odd++;
65 + u_p += ((uv_pitch << 1) - w) >> 1;
66 + v_p += ((uv_pitch << 1) - w) >> 1;
68 + y_p_even += (y_pitch - w) + y_pitch;
69 + y_p_odd += (y_pitch - w) + y_pitch;
74 + for (y=0; y<h; y+=2)
78 + // avoid using d8-d15 (q4-q7): these are AAPCS callee-saved registers
81 + "vld1.u8 {d0}, [%[u_p]]!\n\t"
82 + "sub %[x],%[x],#16\n\t"
84 + "vld1.u8 {d1}, [%[v_p]]!\n\t"
85 + "vld1.u8 {q1}, [%[y_p_even]]!\n\t"
86 + "vzip.u8 d0, d1\n\t"
87 + "vld1.u8 {q2}, [%[y_p_odd]]!\n\t"
88 + // use 2-element struct stores to zip up y with u&v
89 + "vst2.u8 {q0,q1}, [%[dest_even]]!\n\t"
90 + "vmov.u8 q1, q2\n\t"
91 + "vst2.u8 {q0,q1}, [%[dest_odd]]!\n\t"
93 + : [u_p] "+r" (u_p), [v_p] "+r" (v_p), [y_p_even] "+r" (y_p_even), [y_p_odd] "+r" (y_p_odd),
94 + [dest_even] "+r" (dest_even), [dest_odd] "+r" (dest_odd),
97 + : "cc", "memory", "d0","d1","d2","d3","d4","d5"
101 + // overlap final 16-pixel block to process requested width exactly
110 + // do another 16-pixel block
115 + dest_even += w * 2;
118 + u_p += ((uv_pitch << 1) - w) >> 1;
119 + v_p += ((uv_pitch << 1) - w) >> 1;
121 + y_p_even += (y_pitch - w) + y_pitch;
122 + y_p_odd += (y_pitch - w) + y_pitch;
127 --- /tmp/omapfb-xv-generic.c 2009-02-03 10:52:18.000000000 +0100
128 +++ git/src/omapfb-xv-generic.c 2009-02-03 10:52:24.000000000 +0100
131 uint8_t *ub = yb + (src_y_pitch * src_h);
132 uint8_t *vb = ub + (src_uv_pitch * (src_h / 2));
133 - uv12_to_uyvy(src_w & ~15,
134 + uv12_to_uyvy_neon(src_w & ~15,
140 uint8_t *vb = yb + (src_y_pitch * src_h);
141 uint8_t *ub = vb + (src_uv_pitch * (src_h / 2));
142 - uv12_to_uyvy(src_w & ~15,
143 + uv12_to_uyvy_neon(src_w & ~15,