author    Christophe Gisquet <christophe.gisquet@gmail.com>  2012-04-19 22:36:17 +0200
committer Diego Biurrun <diego@biurrun.de>                   2012-05-10 18:42:43 +0200
commit    110d0cdc9d1ec414a658f841a3fbefbf6f796d61 (patch)
tree      d2f80a035204c7a75a6daa5c71357e61817ffd54 /libavcodec
parent    706b998cdcea97c50fad2228f67488de0e06b2a2 (diff)
download  ffmpeg-110d0cdc9d1ec414a658f841a3fbefbf6f796d61.tar.gz
rv40dsp x86: MMX/MMX2/3DNow/SSE2/SSSE3 implementations of MC
Code mostly inspired by vp8's MC, however:
- its MMX2 horizontal filter is worse because it can't take advantage
  of the coefficient redundancy;
- that same coefficient redundancy allows better code for the non-SSSE3
  versions.

Benchmark (rounded to tens of units):

         V8x8  H8x8  2D8x8  V16x16  H16x16  2D16x16
C         445   358    985    1785    1559     3280
MMX*      219   271    478     714     929     1443
SSE2      131   158    294     425     515      892
SSSE3     120   122    248     387     390      763

The end result is an overall speedup of around 15% for the SSSE3 version
(measured on 6 sequences); all loop filter functions now take around 55%
of decoding time, while the luma MC dsp functions are around 6%, the
chroma ones around 1.3% and biweight around 2.3%.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
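[Editor's note: for readers unfamiliar with the RV40 luma filter, here is a
minimal scalar C model of what one row of these MC functions computes. It is
not part of the patch; the (c1, c2, shift) triples (52, 20, 6), (20, 20, 5)
and (20, 52, 6) match the coefficient tables added to rv40dsp.asm below, and
the function name is hypothetical.]

    #include <stdint.h>

    /* Hypothetical scalar model of one row of the RV40 6-tap luma MC
     * filter.  The outer taps are always (1, -5, ..., -5, 1); only the
     * middle pair (c1, c2) changes with the subpel position.  This is
     * the coefficient redundancy mentioned above.  The (20, 20) centre
     * filter uses shift 5, which is why the asm below stores it doubled
     * as (40, 40) so all cases can share a single shift of 6. */
    static void rv40_filter_row(uint8_t *dst, const uint8_t *src,
                                int w, int c1, int c2, int shift)
    {
        for (int x = 0; x < w; x++) {
            int v = (src[x - 2] + src[x + 3]
                     - 5 * (src[x - 1] + src[x + 2])
                     + c1 * src[x] + c2 * src[x + 1]
                     + (1 << (shift - 1))) >> shift;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* clip to 8 bits */
        }
    }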
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/x86/dsputil_mmx.c     16
-rw-r--r--  libavcodec/x86/dsputil_mmx.h      5
-rw-r--r--  libavcodec/x86/rv40dsp.asm      316
-rw-r--r--  libavcodec/x86/rv40dsp_init.c   146
4 files changed, 480 insertions, 3 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 3ef19c5d13..6377a73555 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1791,6 +1791,22 @@ QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_, 8, 3dnow)
QPEL_2TAP(avg_, 8, 3dnow)
+void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+ put_pixels8_xy2_mmx(dst, src, stride, 8);
+}
+void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+ put_pixels16_xy2_mmx(dst, src, stride, 16);
+}
+void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+ avg_pixels8_xy2_mmx(dst, src, stride, 8);
+}
+void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
+{
+ avg_pixels16_xy2_mmx(dst, src, stride, 16);
+}
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
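[Editor's note: the mc33 wrappers above exist because the (3,3) subpel
position is buggy in the RV40 format itself and decodes as a plain half-pel
average (see the comment added to rv40dsp_init.c below). A rough scalar model
of what the reused _xy2 helpers compute, with a hypothetical name:]

    #include <stdint.h>

    /* Hypothetical scalar model of put_pixels8_xy2-style averaging:
     * each output pixel is the rounded mean of its 2x2 source
     * neighbourhood, i.e. bilinear interpolation at (x+0.5, y+0.5). */
    static void put_pixels_xy2_c(uint8_t *dst, const uint8_t *src,
                                 int stride, int h, int w)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (src[x] + src[x + 1] +
                          src[x + stride] + src[x + stride + 1] + 2) >> 2;
            dst += stride;
            src += stride;
        }
    }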
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index 097739cf98..37f4581b9c 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -199,6 +199,11 @@ void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
+void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
+
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 721d3df094..e0213f40b9 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -1,5 +1,7 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
+;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
+;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
@@ -25,11 +27,319 @@
SECTION_RODATA
align 16
-shift_round: times 8 dw 1 << (16 - 6)
-cextern pw_16
+pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
+
+sixtap_filter_hb_m: times 8 db 1, -5
+ times 8 db 52, 20
+ ; multiplied by 2 to have the same shift
+ times 8 db 2, -10
+ times 8 db 40, 40
+ ; back to normal
+ times 8 db 1, -5
+ times 8 db 20, 52
+
+sixtap_filter_v_m: times 8 dw 1
+ times 8 dw -5
+ times 8 dw 52
+ times 8 dw 20
+ ; multiplied by 2 to have the same shift
+ times 8 dw 2
+ times 8 dw -10
+ times 8 dw 40
+ times 8 dw 40
+ ; back to normal
+ times 8 dw 1
+ times 8 dw -5
+ times 8 dw 20
+ times 8 dw 52
+
+%ifdef PIC
+%define sixtap_filter_hw picregq
+%define sixtap_filter_hb picregq
+%define sixtap_filter_v picregq
+%define npicregs 1
+%else
+%define sixtap_filter_hw sixtap_filter_hw_m
+%define sixtap_filter_hb sixtap_filter_hb_m
+%define sixtap_filter_v sixtap_filter_v_m
+%define npicregs 0
+%endif
+
+filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
+
+cextern pw_32
+cextern pw_16
+cextern pw_512
SECTION .text
+;-----------------------------------------------------------------------------
+; subpel MC functions:
+;
+; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
+;                                      uint8_t *src, int srcstride,
+;                                      int len, int m);
+;-----------------------------------------------------------------------------
+%macro LOAD 2
+%if WIN64
+ movsxd %1q, %1d
+%endif
+%ifdef PIC
+ add %1q, picregq
+%else
+ add %1q, %2
+%endif
+%endmacro
+
+%macro STORE 3
+%ifidn %3, avg
+ movh %2, [dstq]
+%endif
+ packuswb %1, %1
+%ifidn %3, avg
+%if cpuflag(3dnow)
+ pavgusb %1, %2
+%else
+ pavgb %1, %2
+%endif
+%endif
+ movh [dstq], %1
+%endmacro
+
+%macro FILTER_V 1
+cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ pxor m7, m7
+ LOAD my, sixtap_filter_v
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ punpcklbw m4, m7
+
+%ifdef m8
+ mova m8, [myq+ 0]
+ mova m9, [myq+16]
+ mova m10, [myq+32]
+ mova m11, [myq+48]
+%define COEFF05 m8
+%define COEFF14 m9
+%define COEFF2 m10
+%define COEFF3 m11
+%else
+%define COEFF05 [myq+ 0]
+%define COEFF14 [myq+16]
+%define COEFF2 [myq+32]
+%define COEFF3 [myq+48]
+%endif
+.nextrow:
+ mova m6, m1
+ movh m5, [srcq+2*srcstrideq] ; read new row
+ paddw m6, m4
+ punpcklbw m5, m7
+ pmullw m6, COEFF14
+ paddw m0, m5
+ pmullw m0, COEFF05
+ paddw m6, m0
+ mova m0, m1
+ paddw m6, [pw_32]
+ mova m1, m2
+ pmullw m2, COEFF2
+ paddw m6, m2
+ mova m2, m3
+ pmullw m3, COEFF3
+ paddw m6, m3
+
+ ; round/clip/store
+ mova m3, m4
+ psraw m6, 6
+ mova m4, m5
+ STORE m6, m5, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+%macro FILTER_H 1
+cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_v_m]
+%endif
+ pxor m7, m7
+ LOAD mx, sixtap_filter_v
+ mova m6, [pw_32]
+%ifdef m8
+ mova m8, [mxq+ 0]
+ mova m9, [mxq+16]
+ mova m10, [mxq+32]
+ mova m11, [mxq+48]
+%define COEFF05 m8
+%define COEFF14 m9
+%define COEFF2 m10
+%define COEFF3 m11
+%else
+%define COEFF05 [mxq+ 0]
+%define COEFF14 [mxq+16]
+%define COEFF2 [mxq+32]
+%define COEFF3 [mxq+48]
+%endif
+.nextrow:
+ movq m0, [srcq-2]
+ movq m5, [srcq+3]
+ movq m1, [srcq-1]
+ movq m4, [srcq+2]
+ punpcklbw m0, m7
+ punpcklbw m5, m7
+ punpcklbw m1, m7
+ punpcklbw m4, m7
+ movq m2, [srcq-0]
+ movq m3, [srcq+1]
+ paddw m0, m5
+ paddw m1, m4
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ pmullw m0, COEFF05
+ pmullw m1, COEFF14
+ pmullw m2, COEFF2
+ pmullw m3, COEFF3
+ paddw m0, m6
+ paddw m1, m2
+ paddw m0, m3
+ paddw m0, m1
+ psraw m0, 6
+ STORE m0, m1, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+FILTER_V put
+FILTER_H put
+
+INIT_MMX mmx2
+FILTER_V avg
+FILTER_H avg
+
+INIT_MMX 3dnow
+FILTER_V avg
+FILTER_H avg
+%endif
+
+INIT_XMM sse2
+FILTER_H put
+FILTER_H avg
+FILTER_V put
+FILTER_V avg
+
+%macro FILTER_SSSE3 1
+cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+
+ ; read 5 lines
+ sub srcq, srcstrideq
+ LOAD my, sixtap_filter_hb
+ sub srcq, srcstrideq
+ movh m0, [srcq]
+ movh m1, [srcq+srcstrideq]
+ movh m2, [srcq+srcstrideq*2]
+ lea srcq, [srcq+srcstrideq*2]
+ add srcq, srcstrideq
+ mova m5, [myq]
+ movh m3, [srcq]
+ movh m4, [srcq+srcstrideq]
+ lea srcq, [srcq+2*srcstrideq]
+
+.nextrow:
+ mova m6, m2
+ punpcklbw m0, m1
+ punpcklbw m6, m3
+ pmaddubsw m0, m5
+ pmaddubsw m6, [myq+16]
+ movh m7, [srcq] ; read new row
+ paddw m6, m0
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m7
+ punpcklbw m7, m3
+ pmaddubsw m7, m5
+ paddw m6, m7
+ pmulhrsw m6, [pw_512]
+ STORE m6, m7, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
+%ifdef PIC
+ lea picregq, [sixtap_filter_hb_m]
+%endif
+ mova m3, [filter_h6_shuf2]
+ mova m4, [filter_h6_shuf3]
+ LOAD mx, sixtap_filter_hb
+ mova m5, [mxq] ; set up 6tap filter in bytes
+ mova m6, [mxq+16]
+ mova m7, [filter_h6_shuf1]
+
+.nextrow:
+ movu m0, [srcq-2]
+ mova m1, m0
+ mova m2, m0
+ pshufb m0, m7
+ pshufb m1, m3
+ pshufb m2, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ pmaddubsw m2, m5
+ paddw m0, m1
+ paddw m0, m2
+ pmulhrsw m0, [pw_512]
+ STORE m0, m1, %1
+
+ ; go to next line
+ add dstq, dststrideq
+ add srcq, srcstrideq
+ dec heightd ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+FILTER_SSSE3 put
+FILTER_SSSE3 avg
+
; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
%macro RV40_WCORE 4-5
movh m4, [%3 + r6 + 0]
@@ -143,7 +453,7 @@ SECTION .text
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
- mova m1, [shift_round]
+ mova m1, [pw_1024]
%else
mova m1, [pw_16]
%endif
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index df468aa9e5..3f42363e4e 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -22,8 +22,11 @@
/**
* @file
* RV40 decoder motion compensation functions x86-optimised
+ * The 2,0 and 0,2 cases have h264 equivalents.
+ * 3,3 is buggy in the rv40 format and maps to the _xy2 version.
*/
+#include "libavcodec/x86/dsputil_mmx.h"
#include "libavcodec/rv34dsp.h"
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
@@ -53,6 +56,132 @@ DECLARE_WEIGHT(mmx)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
+/** @{ */
+/**
+ * Define one qpel function.
+ * LOOPSIZE must be already set to the number of pixels processed per
+ * iteration in the inner loop of the called functions.
+ * COFF(x) must be already defined so as to provide the offset into any
+ * array of coeffs used by the called function for the qpel position x.
+ */
+#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
+static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
+ uint8_t *src, \
+ int stride) \
+{ \
+ int i; \
+ if (PH && PV) { \
+ DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
+ uint8_t *tmpptr = tmp + SIZE * 2; \
+ src -= stride * 2; \
+ \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
+ SIZE + 5, HCOFF(PH)); \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
+ SIZE, SIZE, VCOFF(PV)); \
+ } else if (PV) { \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
+ stride, SIZE, VCOFF(PV)); \
+ } else { \
+ for (i = 0; i < SIZE; i += LOOPSIZE) \
+ ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
+ stride, SIZE, HCOFF(PH)); \
+ } \
+};
+
+/** Declare functions for sizes 8 and 16 and given operations
+ * and qpel position. */
+#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
+ QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
+ QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
+
+/** Declare all functions for all sizes and qpel positions */
+#define QPEL_MC_DECL(OP, OPT) \
+void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
+ const uint8_t *src, \
+ ptrdiff_t srcStride, \
+ int len, int m); \
+void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
+ const uint8_t *src, \
+ ptrdiff_t srcStride, \
+ int len, int m); \
+QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
+QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
+QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
+QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
+QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
+QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
+QPEL_FUNCS_DECL(OP, 3, 2, OPT)
+/** @} */
+
+#define LOOPSIZE 8
+#define HCOFF(x) (32 * (x - 1))
+#define VCOFF(x) (32 * (x - 1))
+QPEL_MC_DECL(put_, _ssse3)
+QPEL_MC_DECL(avg_, _ssse3)
+
+#undef LOOPSIZE
+#undef HCOFF
+#undef VCOFF
+#define LOOPSIZE 8
+#define HCOFF(x) (64 * (x - 1))
+#define VCOFF(x) (64 * (x - 1))
+QPEL_MC_DECL(put_, _sse2)
+QPEL_MC_DECL(avg_, _sse2)
+
+#if ARCH_X86_32
+#undef LOOPSIZE
+#undef HCOFF
+#undef VCOFF
+#define LOOPSIZE 4
+#define HCOFF(x) (64 * (x - 1))
+#define VCOFF(x) (64 * (x - 1))
+
+QPEL_MC_DECL(put_, _mmx)
+
+#define ff_put_rv40_qpel_h_mmx2 ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_mmx2 ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _mmx2)
+
+#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
+#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
+QPEL_MC_DECL(avg_, _3dnow)
+#endif
+
+/** @{ */
+/** Set one function */
+#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
+ c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
+
+/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
+#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
+ QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
+ QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
+
+/** Set all functions for all sizes and qpel positions */
+#define QPEL_MC_SET(OP, OPT) \
+QPEL_FUNCS_SET (OP, 0, 1, OPT) \
+QPEL_FUNCS_SET (OP, 0, 3, OPT) \
+QPEL_FUNCS_SET (OP, 1, 0, OPT) \
+QPEL_FUNCS_SET (OP, 1, 1, OPT) \
+QPEL_FUNCS_SET (OP, 1, 2, OPT) \
+QPEL_FUNCS_SET (OP, 1, 3, OPT) \
+QPEL_FUNCS_SET (OP, 2, 1, OPT) \
+QPEL_FUNCS_SET (OP, 2, 2, OPT) \
+QPEL_FUNCS_SET (OP, 2, 3, OPT) \
+QPEL_FUNCS_SET (OP, 3, 0, OPT) \
+QPEL_FUNCS_SET (OP, 3, 1, OPT) \
+QPEL_FUNCS_SET (OP, 3, 2, OPT)
+/** @} */
+
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
{
#if HAVE_YASM
@@ -65,25 +194,42 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
+ c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
+ c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
+ c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
+ c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
+#if ARCH_X86_32
+ QPEL_MC_SET(put_, _mmx)
+#endif
}
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
+#if ARCH_X86_32
+ QPEL_MC_SET(avg_, _mmx2)
+#endif
} else if (mm_flags & AV_CPU_FLAG_3DNOW) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
+#if ARCH_X86_32
+ QPEL_MC_SET(avg_, _3dnow)
+#endif
}
if (mm_flags & AV_CPU_FLAG_SSE2) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
+ QPEL_MC_SET(put_, _sse2)
+ QPEL_MC_SET(avg_, _sse2)
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
+ QPEL_MC_SET(put_, _ssse3)
+ QPEL_MC_SET(avg_, _ssse3)
}
#endif
}
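
[Editor's note: to see how the dispatch pieces fit together, recall that
QPEL_FUNC_SET stores each function at pixels_tab[2 - SIZE / 8][4 * PV + PH],
so 16x16 functions go in row 0, 8x8 in row 1, and mc33 is column
4 * 3 + 3 = 15, matching the explicit mc33 assignments above. For reference,
QPEL_FUNC_DECL(put_, 8, 1, 1, _ssse3) expands to roughly the following, with
the LOOPSIZE == 8 loops collapsed to single calls:]

    /* Approximate expansion of QPEL_FUNC_DECL(put_, 8, 1, 1, _ssse3):
     * the 2D case first filters horizontally into a temporary buffer
     * of SIZE + 5 rows (2 above and 3 below the block, as needed by
     * the 6-tap kernel), then filters that buffer vertically. */
    static void put_rv40_qpel8_mc11_ssse3(uint8_t *dst, uint8_t *src,
                                          int stride)
    {
        DECLARE_ALIGNED(16, uint8_t, tmp)[8 * (8 + 5)];
        uint8_t *tmpptr = tmp + 8 * 2;
        src -= stride * 2;

        ff_put_rv40_qpel_h_ssse3(tmp, 8, src, stride, 8 + 5,
                                 32 * (1 - 1) /* HCOFF(1) */);
        ff_put_rv40_qpel_v_ssse3(dst, stride, tmpptr, 8, 8,
                                 32 * (1 - 1) /* VCOFF(1) */);
    }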