rv40dsp: implement prescaled versions for biweight.

Quite often, the original weights are multiples of 512. By prescaling them by 1/512 when they are computed (once per frame), no intermediate shifting is needed, and no prescaling is needed on each call either. The x86 code already used that trick.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
author: Christophe GISQUET <christophe.gisquet@gmail.com> 2012-03-19 22:46:28 +0100
committer: Ronald S. Bultje <rsbultje@gmail.com> 2012-04-10 10:06:48 -0700
commit: 272b252c0110225188c7d7f31167941210aac197 (patch)
tree: 47bea5996c88057a418e8872a655bac8f261736e /libavcodec/x86
parent: d3c59d5003a483f1a23e225fc71c19bd1116d11c (diff)
download: ffmpeg-272b252c0110225188c7d7f31167941210aac197.tar.gz
2 files changed, 51 insertions, 49 deletions
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index bff3e7b96a..9028e74024 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -139,69 +139,61 @@ SECTION .text
 
 ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 ; %1=size  %2=num of xmm regs
-%macro RV40_WEIGHT  2
-cglobal rv40_weight_func_%1, 6, 7, %2
+; The weights are FP0.14 notation of fractions depending on pts.
+; For timebases without rounding error (e.g. PAL), the fractions
+; can be simplified, and several operations can be avoided.
+; Weights that are multiples of 2^9 are expected to arrive already
+; prescaled by 1/512 and to use the nornd variant (no check is done here).
+%macro RV40_WEIGHT  3
+cglobal rv40_weight_func_%1_%2, 6, 7, %3
 %if cpuflag(ssse3)
     mova       m1, [shift_round]
 %else
     mova       m1, [pw_16]
 %endif
     pxor       m0, m0
-    mov        r6, r3
-    or         r6, r4
-    ; The weights are FP0.14 notation of fractions depending on pts.
-    ; For timebases without rounding error (i.e. PAL), the fractions
-    ; can be simplified, and several operations can be avoided.
-    ; Therefore, we check here whether they are multiples of 2^9 for
-    ; those simplifications to occur.
-    and        r6, 0x1FF
     ; Set loop counter and increments
 %if mmsize == 8
-    mov        r6, %1
+    mov        r6, %2
 %else
-    mov        r6, (%1 * %1) / mmsize
+    mov        r6, (%2 * %2) / mmsize
 %endif
 
-    ; Use result of test now
-    jz .loop_512
     movd       m2, r3
     movd       m3, r4
+%ifidn %1,rnd
+%define  RND   0
     SPLATW     m2, m2
-    SPLATW     m3, m3
-
-.loop:
-    MAIN_LOOP  %1, 0
-    jnz        .loop
-    REP_RET
-
-    ; Weights are multiple of 512, which allows some shortcuts
-.loop_512:
-    sar        r3, 9
-    sar        r4, 9
-    movd       m2, r3
-    movd       m3, r4
+%else
+%define  RND   1
 %if cpuflag(ssse3)
     punpcklbw  m3, m2
-    SPLATW     m3, m3
 %else
     SPLATW     m2, m2
-    SPLATW     m3, m3
 %endif
-.loop2:
-    MAIN_LOOP  %1, 1
-    jnz        .loop2
-    REP_RET
+%endif
+    SPLATW     m3, m3
 
+.loop:
+    MAIN_LOOP  %2, RND
+    jnz        .loop
+    REP_RET
 %endmacro
 
 INIT_MMX mmx
-RV40_WEIGHT    8, 0
-RV40_WEIGHT   16, 0
+RV40_WEIGHT   rnd,    8, 3
+RV40_WEIGHT   rnd,   16, 4
+RV40_WEIGHT   nornd,  8, 3
+RV40_WEIGHT   nornd, 16, 4
 
 INIT_XMM sse2
-RV40_WEIGHT    8, 8
-RV40_WEIGHT   16, 8
+RV40_WEIGHT   rnd,    8, 3
+RV40_WEIGHT   rnd,   16, 4
+RV40_WEIGHT   nornd,  8, 3
+RV40_WEIGHT   nornd, 16, 4
 
 INIT_XMM ssse3
-RV40_WEIGHT    8, 8
-RV40_WEIGHT   16, 8
+RV40_WEIGHT   rnd,    8, 3
+RV40_WEIGHT   rnd,   16, 4
+RV40_WEIGHT   nornd,  8, 3
+RV40_WEIGHT   nornd, 16, 4
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 79c70f78c3..df468aa9e5 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
 
 #define DECLARE_WEIGHT(opt) \
-void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
-                                  int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
-                                  int w1, int w2, ptrdiff_t stride);
+void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                      int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                      int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                        int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                        int w1, int w2, ptrdiff_t stride);
 DECLARE_WEIGHT(mmx)
 DECLARE_WEIGHT(sse2)
 DECLARE_WEIGHT(ssse3)
@@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
         c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
-        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx;
-        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx;
+        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx;
+        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
+        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
+        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
     }
     if (mm_flags & AV_CPU_FLAG_MMX2) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
@@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
         c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
     }
     if (mm_flags & AV_CPU_FLAG_SSE2) {
-        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2;
-        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2;
+        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
+        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
+        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
+        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
     }
     if (mm_flags & AV_CPU_FLAG_SSSE3) {
-        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3;
-        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3;
+        c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
+        c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
+        c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
+        c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
     }
 #endif
 }
author	Christophe GISQUET <christophe.gisquet@gmail.com>	2012-03-19 22:46:28 +0100
committer	Ronald S. Bultje <rsbultje@gmail.com>	2012-04-10 10:06:48 -0700
commit	272b252c0110225188c7d7f31167941210aac197 (patch)
tree	47bea5996c88057a418e8872a655bac8f261736e /libavcodec/x86
parent	d3c59d5003a483f1a23e225fc71c19bd1116d11c (diff)
download	ffmpeg-272b252c0110225188c7d7f31167941210aac197.tar.gz