ARM: NEON 2xN chroma MC

Originally committed as revision 20696 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: Måns Rullgård <mans@mansr.com> 2009-12-02 00:37:36 +0000
committer: Måns Rullgård <mans@mansr.com> 2009-12-02 00:37:36 +0000
commit: 1025d19dd7b53631c77a66c9057fbf1f417fc769 (patch)
tree: c00415d3fbe5bbdbb32f61faf7e0e801324272a5
parent: 04e7f6d2d01cba4c7fb2ad84b13819fa3e4e1425 (diff)
download: ffmpeg-1025d19dd7b53631c77a66c9057fbf1f417fc769.tar.gz
2 files changed, 74 insertions, 0 deletions
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 177497e020..920b2e87c9 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -125,9 +125,11 @@ void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
 
 void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
 
 void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
 
 void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
                                      int beta, int8_t *tc0);
@@ -272,9 +274,11 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     if (CONFIG_H264_DECODER) {
         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
 
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
 
         c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
         c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5c54fa3db1..08ff20702f 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -320,6 +320,74 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         .endfunc
         .endm
 
+        .macro  h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+        push            {r4-r6, lr}
+        ldr             r4,  [sp, #16]
+        ldr             lr,  [sp, #20]
+        pld             [r1]
+        pld             [r1, r2]
+        orrs            r5,  r4,  lr
+        beq             2f
+
+        mul             r5,  r4,  lr
+        rsb             r6,  r5,  lr,  lsl #3
+        rsb             r12, r5,  r4,  lsl #3
+        sub             r4,  r5,  r4,  lsl #3
+        sub             r4,  r4,  lr,  lsl #3
+        add             r4,  r4,  #64
+        vdup.8          d0,  r4
+        vdup.8          d2,  r12
+        vdup.8          d1,  r6
+        vdup.8          d3,  r5
+        vtrn.16         q0,  q1
+1:
+        vld1.32         {d4[0]},  [r1], r2
+        vld1.32         {d4[1]},  [r1], r2
+        vrev64.32       d5,  d4
+        vld1.32         {d5[1]},  [r1]
+        vext.8          q3,  q2,  q2,  #1
+        vtrn.16         q2,  q3
+        vmull.u8        q8,  d4,  d0
+        vmlal.u8        q8,  d5,  d1
+.ifc \type,avg
+        vld1.16         {d18[0]}, [r0,:16], r2
+        vld1.16         {d18[1]}, [r0,:16]
+        sub             r0,  r0,  r2
+.endif
+        vtrn.32         d16, d17
+        vadd.i16        d16, d16, d17
+        vrshrn.u16      d16, q8,  #6
+.ifc \type,avg
+        vrhadd.u8       d16, d16, d18
+.endif
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+        subs            r3,  r3,  #2
+        bgt             1b
+        pop             {r4-r6, pc}
+2:
+.ifc \type,put
+        ldrh            r5,  [r1], r2
+        strh            r5,  [r0], r2
+        ldrh            r6,  [r1], r2
+        strh            r6,  [r0], r2
+.else
+        vld1.16         {d16[0]}, [r1], r2
+        vld1.16         {d16[1]}, [r1], r2
+        vld1.16         {d18[0]}, [r0,:16], r2
+        vld1.16         {d18[1]}, [r0,:16]
+        sub             r0,  r0,  r2
+        vrhadd.u8       d16, d16, d18
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+.endif
+        subs            r3,  r3,  #2
+        bgt             2b
+        pop             {r4-r6, pc}
+        .endfunc
+.endm
+
         .text
         .align
 
@@ -327,6 +395,8 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         h264_chroma_mc8 avg
         h264_chroma_mc4 put
         h264_chroma_mc4 avg
+        h264_chroma_mc2 put
+        h264_chroma_mc2 avg
 
         /* H.264 loop filter */
author	Måns Rullgård <mans@mansr.com>	2009-12-02 00:37:36 +0000
committer	Måns Rullgård <mans@mansr.com>	2009-12-02 00:37:36 +0000
commit	1025d19dd7b53631c77a66c9057fbf1f417fc769 (patch)
tree	c00415d3fbe5bbdbb32f61faf7e0e801324272a5
parent	04e7f6d2d01cba4c7fb2ad84b13819fa3e4e1425 (diff)
download	ffmpeg-1025d19dd7b53631c77a66c9057fbf1f417fc769.tar.gz