aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorJustin Ruggles <justin.ruggles@gmail.com>2011-03-20 13:31:36 -0400
committerJustin Ruggles <justin.ruggles@gmail.com>2011-03-22 21:08:30 -0400
commite6e9823488b4cf42778411f1239592f0787e121e (patch)
treedcdb0a902e921957a3e0b912c56a73d7fb67422d /libavcodec/x86
parente971d81364e93feae8c399075a3be2643192e031 (diff)
downloadffmpeg-e6e9823488b4cf42778411f1239592f0787e121e.tar.gz
Add apply_window_int16() to DSPContext with x86-optimized versions and use it
in the ac3_fixed encoder.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/dsputil_mmx.c40
-rw-r--r--libavcodec/x86/dsputil_yasm.asm126
2 files changed, 164 insertions, 2 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index a611a71f06..4d1a3052ba 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2388,6 +2388,20 @@ int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int or
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
+void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
+ const int16_t *window, unsigned int len);
+
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2749,6 +2763,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
+ if (avctx->flags & CODEC_FLAG_BITEXACT) {
+ c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+ } else {
+ c->apply_window_int16 = ff_apply_window_int16_mmxext;
+ }
#endif
}
if(mm_flags & AV_CPU_FLAG_SSE){
@@ -2771,13 +2790,30 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+ if (avctx->flags & CODEC_FLAG_BITEXACT) {
+ c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+ } else {
+ if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+ c->apply_window_int16 = ff_apply_window_int16_sse2;
+ }
+ }
c->emulated_edge_mc = emulated_edge_mc_sse;
c->gmc= gmc_sse;
#endif
}
- if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+ if (mm_flags & AV_CPU_FLAG_SSSE3) {
+#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_ATOM) {
+ c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
+ } else {
+ c->apply_window_int16 = ff_apply_window_int16_ssse3;
+ }
+ if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+ }
+#endif
+ }
}
if (CONFIG_ENCODERS)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 07dd995599..f2c205b659 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -27,6 +27,8 @@ pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+pd_16384: times 4 dd 16384
section .text align=16
@@ -202,6 +204,130 @@ SCALARPRODUCT_LOOP 0
RET
+;-----------------------------------------------------------------------------
+; void ff_apply_window_int16(int16_t *output, const int16_t *input,
+; const int16_t *window, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro REVERSE_WORDS_MMXEXT 1-2
+ pshufw %1, %1, 0x1B
+%endmacro
+
+%macro REVERSE_WORDS_SSE2 1-2
+ pshuflw %1, %1, 0x1B
+ pshufhw %1, %1, 0x1B
+ pshufd %1, %1, 0x4E
+%endmacro
+
+%macro REVERSE_WORDS_SSSE3 2
+ pshufb %1, %2
+%endmacro
+
+; dst = (dst * src) >> 15
+; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
+; in from the pmullw result.
+%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
+ mova %3, %1
+ pmulhw %1, %2
+ pmullw %3, %2
+ psrlw %3, 15
+ psllw %1, 1
+ por %1, %3
+%endmacro
+
+; dst = ((dst * src) + (1<<14)) >> 15
+%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
+ pmulhrsw %1, %2
+%endmacro
+
+%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
+cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+ lea offset2q, [offsetq-mmsize]
+%if %2
+ mova m5, [pd_16384]
+%elifidn %1, ssse3
+ mova m5, [pb_revwords]
+ ALIGN 16
+%endif
+.loop:
+%if %2
+ ; This version expands 16-bit to 32-bit, multiplies by the window,
+ ; adds 16384 for rounding, right shifts 15, then repacks back to words to
+ ; save to the output. The window is reversed for the second half.
+ mova m3, [windowq+offset2q]
+ mova m4, [ inputq+offset2q]
+ pxor m0, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m4
+ pmaddwd m0, m1
+ paddd m0, m5
+ psrad m0, 15
+ pxor m2, m2
+ punpckhwd m2, m3
+ punpckhwd m1, m4
+ pmaddwd m2, m1
+ paddd m2, m5
+ psrad m2, 15
+ packssdw m0, m2
+ mova [outputq+offset2q], m0
+ REVERSE_WORDS m3
+ mova m4, [ inputq+offsetq]
+ pxor m0, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m4
+ pmaddwd m0, m1
+ paddd m0, m5
+ psrad m0, 15
+ pxor m2, m2
+ punpckhwd m2, m3
+ punpckhwd m1, m4
+ pmaddwd m2, m1
+ paddd m2, m5
+ psrad m2, 15
+ packssdw m0, m2
+ mova [outputq+offsetq], m0
+%elif %3
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The ssse3 version is bit-identical.
+ mova m0, [windowq+offset2q]
+ mova m1, [ inputq+offset2q]
+ pmulhrsw m1, m0
+ REVERSE_WORDS m0, m5
+ pmulhrsw m0, [ inputq+offsetq ]
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m0
+%else
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
+ ; therefore are not bit-identical to the C version.
+ mova m0, [windowq+offset2q]
+ mova m1, [ inputq+offset2q]
+ mova m2, [ inputq+offsetq ]
+ MUL16FIXED m1, m0, m3
+ REVERSE_WORDS m0
+ MUL16FIXED m2, m0, m3
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m2
+%endif
+ add offsetd, mmsize
+ sub offset2d, mmsize
+ jae .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
+%define MUL16FIXED MUL16FIXED_MMXEXT
+APPLY_WINDOW_INT16 mmxext, 0, 0
+APPLY_WINDOW_INT16 mmxext_ba, 1, 0
+INIT_XMM
+%define REVERSE_WORDS REVERSE_WORDS_SSE2
+APPLY_WINDOW_INT16 sse2, 0, 0
+APPLY_WINDOW_INT16 sse2_ba, 1, 0
+APPLY_WINDOW_INT16 ssse3_atom, 0, 1
+%define REVERSE_WORDS REVERSE_WORDS_SSSE3
+APPLY_WINDOW_INT16 ssse3, 0, 1
+
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top