vp3: DC-only IDCT

2-4% faster overall decode. Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
author: David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
committer: David Conrad <lessen42@gmail.com> 2010-04-17 02:04:30 +0000
commit: eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch)
tree: 23225d7976eefaf0292342e6ee8b4ac946efcb8e
parent: f32f7d8b24d1228df447be85046b9346292d936e (diff)
download: ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz
9 files changed, 118 insertions, 3 deletions
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 1f2169ead5..0e44160392 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_neon(DCTELEM *data);
 void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
 void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
 
 void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
 void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
@@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     if (CONFIG_VP3_DECODER) {
         c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
         c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
+        c->vp3_idct_dc_add   = ff_vp3_idct_dc_add_neon;
     }
 
     c->vector_fmul                = ff_vector_fmul_neon;
diff --git a/libavcodec/arm/vp3dsp_neon.S b/libavcodec/arm/vp3dsp_neon.S
index 6deae4725e..ade19984c2 100644
--- a/libavcodec/arm/vp3dsp_neon.S
+++ b/libavcodec/arm/vp3dsp_neon.S
@@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1
     vst1.64         {d7}, [r2,:64], r1
     bx              lr
 endfunc
+
+function ff_vp3_idct_dc_add_neon, export=1 /* add the scaled DC coefficient to an 8x8 block of bytes with unsigned saturation */
+    ldrsh           r2,  [r2]              /* r2 = first coefficient as signed 16 bit; r0/r1/r2 presumably dest/line_size/data per the C prototype order - confirm ABI */
+    movw            r3,  #46341            /* 46341/65536 ~= 1/sqrt(2) */
+    mul             r2,  r3,  r2           /* r2 = 46341 * dc */
+    smulwt          r2,  r3,  r2           /* r2 = (46341 * top halfword of r2) >> 16 */
+    mov             r3,  r0                /* r3 = store pointer; r0 is advanced by the loads below */
+    vdup.16         q15, r2                /* broadcast the low 16 bits of r2 to all 8 lanes */
+    vrshr.s16       q15, q15, #4           /* rounding shift right by 4 -> per-pixel DC offset */
+                                           /* 8 rows below: load 8 bytes, widen and add the offset, saturate back to u8, store */
+    vld1.8          {d0}, [r0,:64], r1     /* row 0; :64 is a 64-bit alignment hint; r0 += r1 after each load */
+    vld1.8          {d1}, [r0,:64], r1
+    vld1.8          {d2}, [r0,:64], r1
+    vaddw.u8        q8,  q15, d0           /* q8 = offset + zero-extended row 0 (16-bit lanes) */
+    vld1.8          {d3}, [r0,:64], r1
+    vaddw.u8        q9,  q15, d1
+    vld1.8          {d4}, [r0,:64], r1
+    vaddw.u8        q10, q15, d2
+    vld1.8          {d5}, [r0,:64], r1
+    vaddw.u8        q11, q15, d3
+    vld1.8          {d6}, [r0,:64], r1
+    vaddw.u8        q12, q15, d4
+    vld1.8          {d7}, [r0,:64], r1
+    vaddw.u8        q13, q15, d5
+    vqmovun.s16     d0,  q8                /* narrow signed 16 bit to unsigned 8 bit with saturation */
+    vaddw.u8        q14, q15, d6
+    vqmovun.s16     d1,  q9
+    vaddw.u8        q15, q15, d7           /* last use of the offset, so q15 can be overwritten in place */
+    vqmovun.s16     d2,  q10
+    vst1.8          {d0}, [r3,:64], r1     /* write row 0 back; r3 += r1 after each store */
+    vqmovun.s16     d3,  q11
+    vst1.8          {d1}, [r3,:64], r1
+    vqmovun.s16     d4,  q12
+    vst1.8          {d2}, [r3,:64], r1
+    vqmovun.s16     d5,  q13
+    vst1.8          {d3}, [r3,:64], r1
+    vqmovun.s16     d6,  q14
+    vst1.8          {d4}, [r3,:64], r1
+    vqmovun.s16     d7,  q15
+    vst1.8          {d5}, [r3,:64], r1
+    vst1.8          {d6}, [r3,:64], r1
+    vst1.8          {d7}, [r3,:64], r1
+    bx              lr                     /* return */
+endfunc
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index bbfdb6ae8d..bbff06df78 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4467,6 +4467,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     if (CONFIG_VP3_DECODER) {
         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
+        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
     }
     if (CONFIG_VP6_DECODER) {
         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d1816e66ba..2c361b9f76 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -86,6 +86,7 @@ extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
 void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
 
 void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
@@ -373,6 +374,7 @@ typedef struct DSPContext {
     void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
     void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
 
+    void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
     void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
     void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
 
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index e46e6a437a..2e72fba0fc 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -1395,8 +1395,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
 
                 /* transform if this block was coded */
                 if (s->all_fragments[i].coding_method != MODE_COPY) {
-                    int intra = s->all_fragments[i].coding_method == MODE_INTRA;
-
                     if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
                         (s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
                         motion_source= golden_plane;
@@ -1456,11 +1454,11 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                     }
 
                         s->dsp.clear_block(block);
-                        vp3_dequant(s, s->all_fragments + i, plane, !intra, block);
 
                     /* invert DCT and place (or add) in final output */
 
                     if (s->all_fragments[i].coding_method == MODE_INTRA) {
+                        vp3_dequant(s, s->all_fragments + i, plane, 0, block);
                         if(s->avctx->idct_algo!=FF_IDCT_VP3)
                             block[0] += 128<<3;
                         s->dsp.idct_put(
@@ -1468,10 +1466,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
                             stride,
                             block);
                     } else {
+                        if (vp3_dequant(s, s->all_fragments + i, plane, 1, block)) {
                         s->dsp.idct_add(
                             output_plane + first_pixel,
                             stride,
                             block);
+                        } else {
+                            s->dsp.vp3_idct_dc_add(output_plane + first_pixel, stride, block);
+                        }
                     }
                 } else {
 
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index 87b64de385..049758e671 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -223,6 +223,25 @@ void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*
     idct(dest, line_size, block, 2);
 }
 
+void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){ /* add the scaled DC coefficient block[0] to each pixel of the 8x8 area at dest; rows are line_size bytes apart */
+    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* base offset by MAX_NEG_CROP so negative sums still index inside ff_cropTbl; presumably a clamp-to-0..255 table - contents not visible here */
+    int i, dc = block[0]; /* only coefficient 0 is read; block is not modified */
+    dc = (46341*dc)>>16; /* scale by 46341/65536 (~1/sqrt(2)) */
+    dc = (46341*dc + (8<<16))>>20; /* scale by 46341/65536 again, then divide by 16 with rounding (8<<16 is half of 1<<20) */
+
+    for(i = 0; i < 8; i++){ /* one iteration per row, 8 pixels per row unrolled by hand */
+        dest[0] = cm[dest[0]+dc];
+        dest[1] = cm[dest[1]+dc];
+        dest[2] = cm[dest[2]+dc];
+        dest[3] = cm[dest[3]+dc];
+        dest[4] = cm[dest[4]+dc];
+        dest[5] = cm[dest[5]+dc];
+        dest[6] = cm[dest[6]+dc];
+        dest[7] = cm[dest[7]+dc];
+        dest += line_size; /* advance to the next row */
+    }
+}
+
 void ff_vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, int *bounding_values)
 {
     unsigned char *end;
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 31387ebb30..cc2f881303 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2653,6 +2653,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
                 }
             }
+            if (CONFIG_VP3_DECODER) {
+                c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
+            }
 
 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
             c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
diff --git a/libavcodec/x86/vp3dsp_mmx.c b/libavcodec/x86/vp3dsp_mmx.c
index fead8e8cef..309dd4aa5d 100644
--- a/libavcodec/x86/vp3dsp_mmx.c
+++ b/libavcodec/x86/vp3dsp_mmx.c
@@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
+{ /* add the scaled DC coefficient block[0] to the 8x8 bytes at dest (rows linesize bytes apart), saturating to 0..255 */
+    int dc = block[0];
+    dc = (46341*dc)>>16;            /* scale by 46341/65536 (~1/sqrt(2)) */
+    dc = (46341*dc + (8<<16))>>20;  /* scale again, then divide by 16 with rounding */
+
+    __asm__ volatile(
+        "movd          %3, %%mm0 \n\t" /* low dword of mm0 = dc */
+        "pshufw $0, %%mm0, %%mm0 \n\t" /* broadcast the low word: mm0 = 4 x dc */
+        "pxor       %%mm1, %%mm1 \n\t"
+        "psubw      %%mm0, %%mm1 \n\t" /* mm1 = 4 x -dc */
+        "packuswb   %%mm0, %%mm0 \n\t" /* mm0 = 8 bytes of dc, unsigned-saturated (0 when dc < 0) */
+        "packuswb   %%mm1, %%mm1 \n\t" /* mm1 = 8 bytes of -dc, unsigned-saturated (0 when dc > 0) */
+
+#define DC_ADD /* four rows: load, saturating add of mm0, saturating subtract of mm1, store */ \
+        "movq        (%0), %%mm2 \n\t" \
+        "movq     (%0,%1), %%mm3 \n\t" \
+        "paddusb    %%mm0, %%mm2 \n\t" \
+        "movq   (%0,%1,2), %%mm4 \n\t" \
+        "paddusb    %%mm0, %%mm3 \n\t" \
+        "movq     (%0,%2), %%mm5 \n\t" \
+        "paddusb    %%mm0, %%mm4 \n\t" \
+        "paddusb    %%mm0, %%mm5 \n\t" \
+        "psubusb    %%mm1, %%mm2 \n\t" \
+        "psubusb    %%mm1, %%mm3 \n\t" \
+        "movq       %%mm2, (%0)  \n\t" \
+        "psubusb    %%mm1, %%mm4 \n\t" \
+        "movq       %%mm3, (%0,%1) \n\t" \
+        "psubusb    %%mm1, %%mm5 \n\t" \
+        "movq       %%mm4, (%0,%1,2) \n\t" \
+        "movq       %%mm5, (%0,%2) \n\t"
+
+        DC_ADD                         /* rows 0-3 */
+        "lea    (%0,%1,4), %0 \n\t"    /* dest += 4*linesize */
+        DC_ADD                         /* rows 4-7 */
+        : "+r"(dest)                   /* %0: row pointer, advanced by the lea */
+        : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc) /* %1, %2 = row offsets, %3 = dc */
+        : "memory"                     /* the asm loads and stores the rows at dest, which are not listed as operands */
+    );
+}
diff --git a/libavcodec/x86/vp3dsp_mmx.h b/libavcodec/x86/vp3dsp_mmx.h
index e565a33023..e0ebf0b0f4 100644
--- a/libavcodec/x86/vp3dsp_mmx.h
+++ b/libavcodec/x86/vp3dsp_mmx.h
@@ -28,6 +28,7 @@
 void ff_vp3_idct_mmx(int16_t *data);
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
 
 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
author	David Conrad <lessen42@gmail.com>	2010-04-17 02:04:30 +0000
committer	David Conrad <lessen42@gmail.com>	2010-04-17 02:04:30 +0000
commit	eb6a6cd788a172f146534c5fab9b98d6cbf59520 (patch)
tree	23225d7976eefaf0292342e6ee8b4ac946efcb8e
parent	f32f7d8b24d1228df447be85046b9346292d936e (diff)
download	ffmpeg-eb6a6cd788a172f146534c5fab9b98d6cbf59520.tar.gz