ffmpeg 0.5: sync ARM optimizations with current git
author    Koen Kooi <koen@openembedded.org>
          Sat, 30 May 2009 09:34:23 +0000 (11:34 +0200)
committer Koen Kooi <koen@openembedded.org>
          Sat, 30 May 2009 09:34:23 +0000 (11:34 +0200)
recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff [new file with mode: 0644]
recipes/ffmpeg/ffmpeg_0.5.bb

diff --git a/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff b/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff
new file mode 100644
index 0000000..7c72ccd
--- /dev/null
+++ b/recipes/ffmpeg/ffmpeg-0.5/ffmpeg-arm-update.diff
@@ -0,0 +1,495 @@
+ Makefile               |    4 +
+ arm/dsputil_neon.c     |   16 ++++
+ arm/dsputil_neon_s.S   |  178 +++++++++++++++++++++++++++++++++++++------------
+ arm/simple_idct_neon.S |   17 ++++
+ arm/vp3dsp_neon.S      |   94 +++++++++++++++++++++++++
+ 5 files changed, 265 insertions(+), 44 deletions(-)
+diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon.c ffmpeg-0.5/libavcodec/arm/dsputil_neon.c
+--- ffmpeg.old/libavcodec/arm/dsputil_neon.c   2009-01-31 00:13:19.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/dsputil_neon.c   2009-05-30 11:27:54.000000000 +0200
+@@ -41,6 +41,10 @@
+ void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
++void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
++
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
+ void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
+@@ -146,6 +150,9 @@
+                             DCTELEM *block, int stride,
+                             const uint8_t nnzc[6*8]);
++void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
++void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
++
+ void ff_vector_fmul_neon(float *dst, const float *src, int len);
+ void ff_vector_fmul_window_neon(float *dst, const float *src0,
+                                 const float *src1, const float *win,
+@@ -176,6 +183,10 @@
+     c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
++    c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++    c->put_pixels_clamped = ff_put_pixels_clamped_neon;
++    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
++
+     c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+     c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+@@ -247,6 +258,11 @@
+     c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+     c->h264_idct_add8       = ff_h264_idct_add8_neon;
++    if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
++        c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
++        c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
++    }
++
+     c->vector_fmul = ff_vector_fmul_neon;
+     c->vector_fmul_window = ff_vector_fmul_window_neon;
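The three new clamped-pixel prototypes wired up above follow the standard FFmpeg DSP contract: an 8x8 block of int16_t DCT coefficients (DCTELEM) is written to, or accumulated onto, a byte plane with the given line stride, saturating to 0..255. A minimal scalar sketch of that contract, assuming the usual semantics; the _ref names and the clip helper are illustrative, not FFmpeg API:

#include <stdint.h>

typedef int16_t DCTELEM;

static inline uint8_t clip_uint8_ref(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* put: store the clamped coefficients. */
void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    for (int i = 0; i < 8; i++, pixels += line_size, block += 8)
        for (int j = 0; j < 8; j++)
            pixels[j] = clip_uint8_ref(block[j]);
}

/* put_signed: bias the signed coefficients by 128 before clamping. */
void put_signed_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    for (int i = 0; i < 8; i++, pixels += line_size, block += 8)
        for (int j = 0; j < 8; j++)
            pixels[j] = clip_uint8_ref(block[j] + 128);
}

/* add: accumulate the coefficients onto the existing pixels. */
void add_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    for (int i = 0; i < 8; i++, pixels += line_size, block += 8)
        for (int j = 0; j < 8; j++)
            pixels[j] = clip_uint8_ref(pixels[j] + block[j]);
}

The NEON versions in dsputil_neon_s.S below interleave the loads, narrowing saturations (vqmovun.s16 / vqmovn.s16), and stores of several rows to keep the pipeline busy, but compute exactly this.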
+diff -Nurd ffmpeg.old/libavcodec/arm/dsputil_neon_s.S ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S
+--- ffmpeg.old/libavcodec/arm/dsputil_neon_s.S 2009-01-31 00:13:19.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/dsputil_neon_s.S 2009-05-30 11:27:54.000000000 +0200
+@@ -38,13 +38,13 @@
+         pld             [r1, r2]
+         pld             [r1, r2, lsl #1]
+ .if \avg
+-        vld1.64         {d16,d17}, [ip], r2
++        vld1.64         {d16,d17}, [ip,:128], r2
+         vrhadd.u8       q0,  q0,  q8
+-        vld1.64         {d18,d19}, [ip], r2
++        vld1.64         {d18,d19}, [ip,:128], r2
+         vrhadd.u8       q1,  q1,  q9
+-        vld1.64         {d20,d21}, [ip], r2
++        vld1.64         {d20,d21}, [ip,:128], r2
+         vrhadd.u8       q2,  q2,  q10
+-        vld1.64         {d22,d23}, [ip], r2
++        vld1.64         {d22,d23}, [ip,:128], r2
+         vrhadd.u8       q3,  q3,  q11
+ .endif
+         subs            r3,  r3,  #4
+@@ -73,35 +73,29 @@
+         .endm
+         .macro pixels16_y2 vhadd=vrhadd.u8
+-        push            {lr}
+-        add             ip,  r1,  r2
+-        lsl             lr,  r2,  #1
+-        vld1.64         {d0, d1},  [r1], lr
+-        vld1.64         {d2, d3},  [ip], lr
++        vld1.64         {d0, d1},  [r1], r2
++        vld1.64         {d2, d3},  [r1], r2
+ 1:      subs            r3,  r3,  #2
+         \vhadd          q2,  q0,  q1
+-        vld1.64         {d0, d1},  [r1],      lr
++        vld1.64         {d0, d1},  [r1], r2
+         \vhadd          q3,  q0,  q1
+-        vld1.64         {d2, d3},  [ip],      lr
++        vld1.64         {d2, d3},  [r1], r2
+         pld             [r1]
+-        pld             [ip]
++        pld             [r1, r2]
+         vst1.64         {d4, d5},  [r0,:128], r2
+         vst1.64         {d6, d7},  [r0,:128], r2
+         bne             1b
+-        pop             {pc}
++        bx              lr
+         .endm
+         .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
+-        push            {lr}
+-        lsl             lr,  r2,  #1
+-        add             ip,  r1,  r2
+-        vld1.64         {d0-d2},   [r1], lr
+-        vld1.64         {d4-d6},   [ip], lr
++        vld1.64         {d0-d2},   [r1], r2
++        vld1.64         {d4-d6},   [r1], r2
+ .if \no_rnd
+         vmov.i16        q13, #1
+ .endif
+         pld             [r1]
+-        pld             [ip]
++        pld             [r1, r2]
+         vext.8          q1,  q0,  q1,  #1
+         vext.8          q3,  q2,  q3,  #1
+         vaddl.u8        q8,  d0,  d2
+@@ -109,7 +103,7 @@
+         vaddl.u8        q9,  d4,  d6
+         vaddl.u8        q11, d5,  d7
+ 1:      subs            r3,  r3,  #2
+-        vld1.64         {d0-d2},   [r1], lr
++        vld1.64         {d0-d2},   [r1], r2
+         vadd.u16        q12, q8,  q9
+         pld             [r1]
+ .if \no_rnd
+@@ -123,11 +117,11 @@
+ .endif
+         \vshrn          d29, q1,  #2
+         vaddl.u8        q8,  d0,  d30
+-        vld1.64         {d2-d4},   [ip], lr
++        vld1.64         {d2-d4},   [r1], r2
+         vaddl.u8        q10, d1,  d31
+         vst1.64         {d28,d29}, [r0,:128], r2
+         vadd.u16        q12, q8,  q9
+-        pld             [ip]
++        pld             [r1, r2]
+ .if \no_rnd
+         vadd.u16        q12, q12, q13
+ .endif
+@@ -142,7 +136,7 @@
+         vaddl.u8        q11, d3,  d5
+         vst1.64         {d30,d31}, [r0,:128], r2
+         bgt             1b
+-        pop             {pc}
++        bx              lr
+         .endm
+         .macro pixels8
+@@ -180,41 +174,35 @@
+         .endm
+         .macro pixels8_y2 vhadd=vrhadd.u8
+-        push            {lr}
+-        add             ip,  r1,  r2
+-        lsl             lr,  r2,  #1
+-        vld1.64         {d0},      [r1], lr
+-        vld1.64         {d1},      [ip], lr
++        vld1.64         {d0},      [r1], r2
++        vld1.64         {d1},      [r1], r2
+ 1:      subs            r3,  r3,  #2
+         \vhadd          d4,  d0,  d1
+-        vld1.64         {d0},      [r1],     lr
++        vld1.64         {d0},      [r1], r2
+         \vhadd          d5,  d0,  d1
+-        vld1.64         {d1},      [ip],     lr
++        vld1.64         {d1},      [r1], r2
+         pld             [r1]
+-        pld             [ip]
++        pld             [r1, r2]
+         vst1.64         {d4},      [r0,:64], r2
+         vst1.64         {d5},      [r0,:64], r2
+         bne             1b
+-        pop             {pc}
++        bx              lr
+         .endm
+         .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
+-        push            {lr}
+-        lsl             lr,  r2,  #1
+-        add             ip,  r1,  r2
+-        vld1.64         {d0, d1},  [r1], lr
+-        vld1.64         {d2, d3},  [ip], lr
++        vld1.64         {d0, d1},  [r1], r2
++        vld1.64         {d2, d3},  [r1], r2
+ .if \no_rnd
+         vmov.i16        q11, #1
+ .endif
+         pld             [r1]
+-        pld             [ip]
++        pld             [r1, r2]
+         vext.8          d4,  d0,  d1,  #1
+         vext.8          d6,  d2,  d3,  #1
+         vaddl.u8        q8,  d0,  d4
+         vaddl.u8        q9,  d2,  d6
+ 1:      subs            r3,  r3,  #2
+-        vld1.64         {d0, d1},  [r1], lr
++        vld1.64         {d0, d1},  [r1], r2
+         pld             [r1]
+         vadd.u16        q10, q8,  q9
+         vext.8          d4,  d0,  d1,  #1
+@@ -223,9 +211,9 @@
+ .endif
+         vaddl.u8        q8,  d0,  d4
+         \vshrn          d5,  q10, #2
+-        vld1.64         {d2, d3},  [ip], lr
++        vld1.64         {d2, d3},  [r1], r2
+         vadd.u16        q10, q8,  q9
+-        pld             [ip]
++        pld             [r1, r2]
+ .if \no_rnd
+         vadd.u16        q10, q10, q11
+ .endif
+@@ -235,7 +223,7 @@
+         vaddl.u8        q9,  d2,  d6
+         vst1.64         {d7},      [r0,:64], r2
+         bgt             1b
+-        pop             {pc}
++        bx              lr
+         .endm
+         .macro pixfunc pfx name suf rnd_op args:vararg
+@@ -273,6 +261,112 @@
+         pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
+         pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
++function ff_put_pixels_clamped_neon, export=1
++        vld1.64         {d16-d19}, [r0,:128]!
++        vqmovun.s16     d0, q8
++        vld1.64         {d20-d23}, [r0,:128]!
++        vqmovun.s16     d1, q9
++        vld1.64         {d24-d27}, [r0,:128]!
++        vqmovun.s16     d2, q10
++        vld1.64         {d28-d31}, [r0,:128]!
++        vqmovun.s16     d3, q11
++        vst1.64         {d0},      [r1,:64], r2
++        vqmovun.s16     d4, q12
++        vst1.64         {d1},      [r1,:64], r2
++        vqmovun.s16     d5, q13
++        vst1.64         {d2},      [r1,:64], r2
++        vqmovun.s16     d6, q14
++        vst1.64         {d3},      [r1,:64], r2
++        vqmovun.s16     d7, q15
++        vst1.64         {d4},      [r1,:64], r2
++        vst1.64         {d5},      [r1,:64], r2
++        vst1.64         {d6},      [r1,:64], r2
++        vst1.64         {d7},      [r1,:64], r2
++        bx              lr
++        .endfunc
++
++function ff_put_signed_pixels_clamped_neon, export=1
++        vmov.u8         d31, #128
++        vld1.64         {d16-d17}, [r0,:128]!
++        vqmovn.s16      d0, q8
++        vld1.64         {d18-d19}, [r0,:128]!
++        vqmovn.s16      d1, q9
++        vld1.64         {d16-d17}, [r0,:128]!
++        vqmovn.s16      d2, q8
++        vld1.64         {d18-d19}, [r0,:128]!
++        vadd.u8         d0, d0, d31
++        vld1.64         {d20-d21}, [r0,:128]!
++        vadd.u8         d1, d1, d31
++        vld1.64         {d22-d23}, [r0,:128]!
++        vadd.u8         d2, d2, d31
++        vst1.64         {d0},      [r1,:64], r2
++        vqmovn.s16      d3, q9
++        vst1.64         {d1},      [r1,:64], r2
++        vqmovn.s16      d4, q10
++        vst1.64         {d2},      [r1,:64], r2
++        vqmovn.s16      d5, q11
++        vld1.64         {d24-d25}, [r0,:128]!
++        vadd.u8         d3, d3, d31
++        vld1.64         {d26-d27}, [r0,:128]!
++        vadd.u8         d4, d4, d31
++        vadd.u8         d5, d5, d31
++        vst1.64         {d3},      [r1,:64], r2
++        vqmovn.s16      d6, q12
++        vst1.64         {d4},      [r1,:64], r2
++        vqmovn.s16      d7, q13
++        vst1.64         {d5},      [r1,:64], r2
++        vadd.u8         d6, d6, d31
++        vadd.u8         d7, d7, d31
++        vst1.64         {d6},      [r1,:64], r2
++        vst1.64         {d7},      [r1,:64], r2
++        bx              lr
++        .endfunc
++
++function ff_add_pixels_clamped_neon, export=1
++        mov             r3, r1
++        vld1.64         {d16},   [r1,:64], r2
++        vld1.64         {d0-d1}, [r0,:128]!
++        vaddw.u8        q0, q0, d16
++        vld1.64         {d17},   [r1,:64], r2
++        vld1.64         {d2-d3}, [r0,:128]!
++        vqmovun.s16     d0, q0
++        vld1.64         {d18},   [r1,:64], r2
++        vaddw.u8        q1, q1, d17
++        vld1.64         {d4-d5}, [r0,:128]!
++        vaddw.u8        q2, q2, d18
++        vst1.64         {d0},    [r3,:64], r2
++        vqmovun.s16     d2, q1
++        vld1.64         {d19},   [r1,:64], r2
++        vld1.64         {d6-d7}, [r0,:128]!
++        vaddw.u8        q3, q3, d19
++        vqmovun.s16     d4, q2
++        vst1.64         {d2},    [r3,:64], r2
++        vld1.64         {d16},   [r1,:64], r2
++        vqmovun.s16     d6, q3
++        vld1.64         {d0-d1}, [r0,:128]!
++        vaddw.u8        q0, q0, d16
++        vst1.64         {d4},    [r3,:64], r2
++        vld1.64         {d17},   [r1,:64], r2
++        vld1.64         {d2-d3}, [r0,:128]!
++        vaddw.u8        q1, q1, d17
++        vst1.64         {d6},    [r3,:64], r2
++        vqmovun.s16     d0, q0
++        vld1.64         {d18},   [r1,:64], r2
++        vld1.64         {d4-d5}, [r0,:128]!
++        vaddw.u8        q2, q2, d18
++        vst1.64         {d0},    [r3,:64], r2
++        vqmovun.s16     d2, q1
++        vld1.64         {d19},   [r1,:64], r2
++        vqmovun.s16     d4, q2
++        vld1.64         {d6-d7}, [r0,:128]!
++        vaddw.u8        q3, q3, d19
++        vst1.64         {d2},    [r3,:64], r2
++        vqmovun.s16     d6, q3
++        vst1.64         {d4},    [r3,:64], r2
++        vst1.64         {d6},    [r3,:64], r2
++        bx              lr
++        .endfunc
++
+ function ff_float_to_int16_neon, export=1
+         subs            r2,  r2,  #8
+         vld1.64         {d0-d1},  [r1,:128]!
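Two recurring changes in the dsputil_neon_s.S hunks above are worth noting. The ':64' and ':128' suffixes added to vld1/vst1 addresses are alignment qualifiers: they promise the core that the pointer is 64- or 128-bit aligned, which enables the faster aligned-access paths (and faults if the promise is broken). Separately, the pixels*_y2/xy2 macros drop the old two-pointer walk (ip and lr holding a doubled stride) for a single r1 pointer stepped by r2; with lr no longer clobbered, the push {lr} / pop {pc} prologue and epilogue collapse to a plain bx lr. A rough C analogy for the addressing change, with illustrative names:

#include <stdint.h>

/* Old scheme: two interleaved pointers, each advanced by 2*stride
 * (the doubled stride needed an extra register -- the spilled lr). */
void walk_two_pointers(const uint8_t *src, int stride, int h,
                       void (*use_row)(const uint8_t *))
{
    const uint8_t *a = src;            /* like r1           */
    const uint8_t *b = src + stride;   /* like ip = r1 + r2 */
    const int step = 2 * stride;       /* like lr = r2 << 1 */
    for (int i = 0; i < h; i += 2) {
        use_row(a); a += step;
        use_row(b); b += step;
    }
}

/* New scheme: one pointer, one stride, same rows in the same order. */
void walk_one_pointer(const uint8_t *src, int stride, int h,
                      void (*use_row)(const uint8_t *))
{
    for (int i = 0; i < h; i++, src += stride)
        use_row(src);
}

The consumer sees an identical row order; the payoff in the asm is one fewer live address register and leaf functions that never touch the stack.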
+diff -Nurd ffmpeg.old/libavcodec/arm/simple_idct_neon.S ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S
+--- ffmpeg.old/libavcodec/arm/simple_idct_neon.S       2008-12-30 04:13:52.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/simple_idct_neon.S       2009-05-30 11:27:54.000000000 +0200
+@@ -68,6 +68,19 @@
+         .text
+         .align 6
++function idct_row4_pld_neon
++        pld             [r0]
++        add             r3,  r0,  r1,  lsl #2
++        pld             [r0, r1]
++        pld             [r0, r1, lsl #1]
++        pld             [r3, -r1]
++        pld             [r3]
++        pld             [r3, r1]
++        add             r3,  r3,  r1,  lsl #1
++        pld             [r3]
++        pld             [r3, r1]
++        .endfunc
++
+ function idct_row4_neon
+         vmov.i32        q15, #(1<<(ROW_SHIFT-1))
+         vld1.64         {d2-d5},  [r2,:128]!
+@@ -252,7 +265,7 @@
+ function ff_simple_idct_put_neon, export=1
+         idct_start      r2
+-        bl              idct_row4_neon
++        bl              idct_row4_pld_neon
+         bl              idct_row4_neon
+         add             r2,  r2,  #-128
+         bl              idct_col4_neon
+@@ -307,7 +320,7 @@
+ function ff_simple_idct_add_neon, export=1
+         idct_start      r2
+-        bl              idct_row4_neon
++        bl              idct_row4_pld_neon
+         bl              idct_row4_neon
+         add             r2,  r2,  #-128
+         bl              idct_col4_neon
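The new idct_row4_pld_neon entry does no arithmetic: it prefetches the eight destination lines (in ff_simple_idct_put_neon and ff_simple_idct_add_neon, r0 is the destination pointer and r1 the line size) and then falls straight through into idct_row4_neon, which follows it in the text; note there is no return before its .endfunc. A rough C equivalent of the prefetch, assuming GCC, whose __builtin_prefetch lowers to pld on ARM (the helper name is hypothetical):

#include <stdint.h>

/* Touch the eight destination rows up front so the later column-pass
 * stores in the put/add paths hit warm cache lines. */
void prefetch_dest_rows(const uint8_t *dest, int line_size)
{
    for (int row = 0; row < 8; row++)
        __builtin_prefetch(dest + row * line_size);
}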
+diff -Nurd ffmpeg.old/libavcodec/arm/vp3dsp_neon.S ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S
+--- ffmpeg.old/libavcodec/arm/vp3dsp_neon.S    1970-01-01 01:00:00.000000000 +0100
++++ ffmpeg-0.5/libavcodec/arm/vp3dsp_neon.S    2009-05-30 11:27:54.000000000 +0200
+@@ -0,0 +1,94 @@
++/*
++ * Copyright (c) 2009 David Conrad
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "asm.S"
++
++.macro vp3_loop_filter
++    vsubl.u8        q3,  d18, d17
++    vsubl.u8        q2,  d16, d19
++    vadd.i16        q1,  q3,  q3
++    vadd.i16        q2,  q2,  q3
++    vadd.i16        q0,  q1,  q2
++    vrshr.s16       q0,  q0,  #3
++    vmovl.u8        q9,  d18
++    vdup.u16        q15, r2
++
++    vabs.s16        q1,  q0
++    vshr.s16        q0,  q0,  #15
++    vqsub.u16       q2,  q15, q1
++    vqsub.u16       q3,  q2,  q1
++    vsub.i16        q1,  q2,  q3
++    veor            q1,  q1,  q0
++    vsub.i16        q0,  q1,  q0
++
++    vaddw.u8        q2,  q0,  d17
++    vsub.i16        q3,  q9,  q0
++    vqmovun.s16     d0,  q2
++    vqmovun.s16     d1,  q3
++.endm
++
++function ff_vp3_v_loop_filter_neon, export=1
++    sub             ip,  r0,  r1
++    sub             r0,  r0,  r1,  lsl #1
++    vld1.64         {d16}, [r0,:64], r1
++    vld1.64         {d17}, [r0,:64], r1
++    vld1.64         {d18}, [r0,:64], r1
++    vld1.64         {d19}, [r0,:64], r1
++    ldrb            r2,    [r2, #129*4]
++
++    vp3_loop_filter
++
++    vst1.64         {d0},  [ip,:64], r1
++    vst1.64         {d1},  [ip,:64], r1
++    bx              lr
++.endfunc
++
++function ff_vp3_h_loop_filter_neon, export=1
++    sub             ip,  r0,  #1
++    sub             r0,  r0,  #2
++    vld1.32         {d16[]},  [r0], r1
++    vld1.32         {d17[]},  [r0], r1
++    vld1.32         {d18[]},  [r0], r1
++    vld1.32         {d19[]},  [r0], r1
++    vld1.32         {d16[1]}, [r0], r1
++    vld1.32         {d17[1]}, [r0], r1
++    vld1.32         {d18[1]}, [r0], r1
++    vld1.32         {d19[1]}, [r0], r1
++    ldrb            r2,  [r2, #129*4]
++
++    vtrn.8          d16, d17
++    vtrn.8          d18, d19
++    vtrn.16         d16, d18
++    vtrn.16         d17, d19
++
++    vp3_loop_filter
++
++    vtrn.8          d0,  d1
++
++    vst1.16         {d0[0]}, [ip], r1
++    vst1.16         {d1[0]}, [ip], r1
++    vst1.16         {d0[1]}, [ip], r1
++    vst1.16         {d1[1]}, [ip], r1
++    vst1.16         {d0[2]}, [ip], r1
++    vst1.16         {d1[2]}, [ip], r1
++    vst1.16         {d0[3]}, [ip], r1
++    vst1.16         {d1[3]}, [ip], r1
++    bx              lr
++.endfunc
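For reference, below is the edge filter that the vp3_loop_filter macro vectorizes, sketched in scalar C after FFmpeg's C implementation (the _ref names and clip helper are illustrative). bounding_values points into the middle of a 256-entry table derived from the frame's filter limit, so the (f + 4) >> 3 index may be negative; the NEON version avoids the per-pixel table lookup by loading a value out of that table (the ldrb at byte offset 129*4, which appears to hold the raw filter limit) and rebuilding the bounding function with saturating arithmetic.

#include <stdint.h>

static inline uint8_t clip_uint8_ref(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Vertical filter across a horizontal block edge: pix points at the
 * first row below the edge, so pix[-2*stride] and pix[-stride] are the
 * two rows above it. Eight pixels are filtered per call. */
void vp3_v_loop_filter_ref(uint8_t *pix, int stride,
                           const int *bounding_values)
{
    for (int x = 0; x < 8; x++, pix++) {
        int f = (pix[-2 * stride] - pix[stride])
              + 3 * (pix[0] - pix[-stride]);
        f = bounding_values[(f + 4) >> 3];   /* may index below the pointer */
        pix[-stride] = clip_uint8_ref(pix[-stride] + f);
        pix[0]       = clip_uint8_ref(pix[0] - f);
    }
}

ff_vp3_h_loop_filter_neon reuses the same macro for vertical edges by loading 4-pixel groups from eight rows and transposing them with the vtrn ladder, filtering, then transposing back before the 16-bit stores.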
+diff -Nurd ffmpeg.old/libavcodec/Makefile ffmpeg-0.5/libavcodec/Makefile
+--- ffmpeg.old/libavcodec/Makefile     2009-02-26 03:29:24.000000000 +0100
++++ ffmpeg-0.5/libavcodec/Makefile     2009-05-30 11:29:51.000000000 +0200
+@@ -477,11 +477,15 @@
+ OBJS-$(HAVE_IWMMXT)                    += arm/dsputil_iwmmxt.o          \
+                                           arm/mpegvideo_iwmmxt.o        \
++NEON-OBJS-$(CONFIG_THEORA_DECODER)     += arm/vp3dsp_neon.o
++NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
++
+ OBJS-$(HAVE_NEON)                      += arm/dsputil_neon.o            \
+                                           arm/dsputil_neon_s.o          \
+                                           arm/h264dsp_neon.o            \
+                                           arm/h264idct_neon.o           \
+                                           arm/simple_idct_neon.o        \
++                                          $(NEON-OBJS-yes)
+ OBJS-$(ARCH_BFIN)                      += bfin/dsputil_bfin.o           \
+                                           bfin/fdct_bfin.o              \
diff --git a/recipes/ffmpeg/ffmpeg_0.5.bb b/recipes/ffmpeg/ffmpeg_0.5.bb
index 54db004..63549b7 100644
--- a/recipes/ffmpeg/ffmpeg_0.5.bb
+++ b/recipes/ffmpeg/ffmpeg_0.5.bb
@@ -3,13 +3,14 @@ require ffmpeg.inc
 DEPENDS += "schroedinger libgsm"
 
 PE = "1"
-PR = "r1"
+PR = "r2"
 
 DEFAULT_PREFERENCE = "1"
 
 SRCREV_libswscale = "b2e1c8222eeef74b0ca8053b400957dd69e18e4d"
 SRC_URI = "http://ffmpeg.org/releases/ffmpeg-${PV}.tar.bz2 \
           file://armv4.patch;patch=1 \
+       file://ffmpeg-arm-update.diff;patch=1 \
          "
 
 #S = "${WORKDIR}/git"