diff options
author | Diego Biurrun <diego@biurrun.de> | 2014-03-26 04:41:28 -0700 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2014-04-04 19:08:05 +0200 |
commit | 57b5b84e208ad61ffdd74ad849bed212deb92bc5 (patch) | |
tree | c6e408f756b6b780579b676a9f286105a2c99a8e | |
parent | c2c5be57494e6117086771bca34c8cd4c72c8e99 (diff) | |
download | ffmpeg-57b5b84e208ad61ffdd74ad849bed212deb92bc5.tar.gz |
x86: dsputil: Move ff_apply_window_int16_* bits to ac3dsp, where they belong
-rw-r--r-- | libavcodec/x86/ac3dsp.asm | 131 | ||||
-rw-r--r-- | libavcodec/x86/dsputil.asm | 130 |
2 files changed, 131 insertions, 130 deletions
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm index d0ab3e7359..817d5a319c 100644 --- a/libavcodec/x86/ac3dsp.asm +++ b/libavcodec/x86/ac3dsp.asm @@ -35,6 +35,10 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7 pd_1: times 4 dd 1 pd_151: times 4 dd 151 +; used in ff_apply_window_int16() +pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 +pd_16384: times 4 dd 16384 + SECTION .text ;----------------------------------------------------------------------------- @@ -419,3 +423,130 @@ AC3_EXTRACT_EXPONENTS INIT_XMM ssse3 AC3_EXTRACT_EXPONENTS %endif + +;----------------------------------------------------------------------------- +; void ff_apply_window_int16(int16_t *output, const int16_t *input, +; const int16_t *window, unsigned int len) +;----------------------------------------------------------------------------- + +%macro REVERSE_WORDS 1-2 +%if cpuflag(ssse3) && notcpuflag(atom) + pshufb %1, %2 +%elif cpuflag(sse2) + pshuflw %1, %1, 0x1B + pshufhw %1, %1, 0x1B + pshufd %1, %1, 0x4E +%elif cpuflag(mmxext) + pshufw %1, %1, 0x1B +%endif +%endmacro + +%macro MUL16FIXED 3 +%if cpuflag(ssse3) ; dst, src, unused +; dst = ((dst * src) + (1<<14)) >> 15 + pmulhrsw %1, %2 +%elif cpuflag(mmxext) ; dst, src, temp +; dst = (dst * src) >> 15 +; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back +; in from the pmullw result. + mova %3, %1 + pmulhw %1, %2 + pmullw %3, %2 + psrlw %3, 15 + psllw %1, 1 + por %1, %3 +%endif +%endmacro + +%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version +%if %1 +cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2 +%else +cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2 +%endif + lea offset2q, [offsetq-mmsize] +%if cpuflag(ssse3) && notcpuflag(atom) + mova m5, [pb_revwords] + ALIGN 16 +%elif %1 + mova m5, [pd_16384] +%endif +.loop: +%if cpuflag(ssse3) + ; This version does the 16x16->16 multiplication in-place without expanding + ; to 32-bit. The ssse3 version is bit-identical. + mova m0, [windowq+offset2q] + mova m1, [ inputq+offset2q] + pmulhrsw m1, m0 + REVERSE_WORDS m0, m5 + pmulhrsw m0, [ inputq+offsetq ] + mova [outputq+offset2q], m1 + mova [outputq+offsetq ], m0 +%elif %1 + ; This version expands 16-bit to 32-bit, multiplies by the window, + ; adds 16384 for rounding, right shifts 15, then repacks back to words to + ; save to the output. The window is reversed for the second half. + mova m3, [windowq+offset2q] + mova m4, [ inputq+offset2q] + pxor m0, m0 + punpcklwd m0, m3 + punpcklwd m1, m4 + pmaddwd m0, m1 + paddd m0, m5 + psrad m0, 15 + pxor m2, m2 + punpckhwd m2, m3 + punpckhwd m1, m4 + pmaddwd m2, m1 + paddd m2, m5 + psrad m2, 15 + packssdw m0, m2 + mova [outputq+offset2q], m0 + REVERSE_WORDS m3 + mova m4, [ inputq+offsetq] + pxor m0, m0 + punpcklwd m0, m3 + punpcklwd m1, m4 + pmaddwd m0, m1 + paddd m0, m5 + psrad m0, 15 + pxor m2, m2 + punpckhwd m2, m3 + punpckhwd m1, m4 + pmaddwd m2, m1 + paddd m2, m5 + psrad m2, 15 + packssdw m0, m2 + mova [outputq+offsetq], m0 +%else + ; This version does the 16x16->16 multiplication in-place without expanding + ; to 32-bit. The mmxext and sse2 versions do not use rounding, and + ; therefore are not bit-identical to the C version. + mova m0, [windowq+offset2q] + mova m1, [ inputq+offset2q] + mova m2, [ inputq+offsetq ] + MUL16FIXED m1, m0, m3 + REVERSE_WORDS m0 + MUL16FIXED m2, m0, m3 + mova [outputq+offset2q], m1 + mova [outputq+offsetq ], m2 +%endif + add offsetd, mmsize + sub offset2d, mmsize + jae .loop + REP_RET +%endmacro + +INIT_MMX mmxext +APPLY_WINDOW_INT16 0 +INIT_XMM sse2 +APPLY_WINDOW_INT16 0 + +INIT_MMX mmxext +APPLY_WINDOW_INT16 1 +INIT_XMM sse2 +APPLY_WINDOW_INT16 1 +INIT_XMM ssse3 +APPLY_WINDOW_INT16 1 +INIT_XMM ssse3, atom +APPLY_WINDOW_INT16 1 diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm index 414d2124db..a8c8fdac2f 100644 --- a/libavcodec/x86/dsputil.asm +++ b/libavcodec/x86/dsputil.asm @@ -27,8 +27,6 @@ pb_zzzzzzzz77777777: times 8 db -1 pb_7: times 8 db 7 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 -pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 -pd_16384: times 4 dd 16384 pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 SECTION_TEXT @@ -205,134 +203,6 @@ SCALARPRODUCT_LOOP 0 RET -;----------------------------------------------------------------------------- -; void ff_apply_window_int16(int16_t *output, const int16_t *input, -; const int16_t *window, unsigned int len) -;----------------------------------------------------------------------------- - -%macro REVERSE_WORDS 1-2 -%if cpuflag(ssse3) && notcpuflag(atom) - pshufb %1, %2 -%elif cpuflag(sse2) - pshuflw %1, %1, 0x1B - pshufhw %1, %1, 0x1B - pshufd %1, %1, 0x4E -%elif cpuflag(mmxext) - pshufw %1, %1, 0x1B -%endif -%endmacro - -%macro MUL16FIXED 3 -%if cpuflag(ssse3) ; dst, src, unused -; dst = ((dst * src) + (1<<14)) >> 15 - pmulhrsw %1, %2 -%elif cpuflag(mmxext) ; dst, src, temp -; dst = (dst * src) >> 15 -; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back -; in from the pmullw result. - mova %3, %1 - pmulhw %1, %2 - pmullw %3, %2 - psrlw %3, 15 - psllw %1, 1 - por %1, %3 -%endif -%endmacro - -%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version -%if %1 -cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2 -%else -cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2 -%endif - lea offset2q, [offsetq-mmsize] -%if cpuflag(ssse3) && notcpuflag(atom) - mova m5, [pb_revwords] - ALIGN 16 -%elif %1 - mova m5, [pd_16384] -%endif -.loop: -%if cpuflag(ssse3) - ; This version does the 16x16->16 multiplication in-place without expanding - ; to 32-bit. The ssse3 version is bit-identical. - mova m0, [windowq+offset2q] - mova m1, [ inputq+offset2q] - pmulhrsw m1, m0 - REVERSE_WORDS m0, m5 - pmulhrsw m0, [ inputq+offsetq ] - mova [outputq+offset2q], m1 - mova [outputq+offsetq ], m0 -%elif %1 - ; This version expands 16-bit to 32-bit, multiplies by the window, - ; adds 16384 for rounding, right shifts 15, then repacks back to words to - ; save to the output. The window is reversed for the second half. - mova m3, [windowq+offset2q] - mova m4, [ inputq+offset2q] - pxor m0, m0 - punpcklwd m0, m3 - punpcklwd m1, m4 - pmaddwd m0, m1 - paddd m0, m5 - psrad m0, 15 - pxor m2, m2 - punpckhwd m2, m3 - punpckhwd m1, m4 - pmaddwd m2, m1 - paddd m2, m5 - psrad m2, 15 - packssdw m0, m2 - mova [outputq+offset2q], m0 - REVERSE_WORDS m3 - mova m4, [ inputq+offsetq] - pxor m0, m0 - punpcklwd m0, m3 - punpcklwd m1, m4 - pmaddwd m0, m1 - paddd m0, m5 - psrad m0, 15 - pxor m2, m2 - punpckhwd m2, m3 - punpckhwd m1, m4 - pmaddwd m2, m1 - paddd m2, m5 - psrad m2, 15 - packssdw m0, m2 - mova [outputq+offsetq], m0 -%else - ; This version does the 16x16->16 multiplication in-place without expanding - ; to 32-bit. The mmxext and sse2 versions do not use rounding, and - ; therefore are not bit-identical to the C version. - mova m0, [windowq+offset2q] - mova m1, [ inputq+offset2q] - mova m2, [ inputq+offsetq ] - MUL16FIXED m1, m0, m3 - REVERSE_WORDS m0 - MUL16FIXED m2, m0, m3 - mova [outputq+offset2q], m1 - mova [outputq+offsetq ], m2 -%endif - add offsetd, mmsize - sub offset2d, mmsize - jae .loop - REP_RET -%endmacro - -INIT_MMX mmxext -APPLY_WINDOW_INT16 0 -INIT_XMM sse2 -APPLY_WINDOW_INT16 0 - -INIT_MMX mmxext -APPLY_WINDOW_INT16 1 -INIT_XMM sse2 -APPLY_WINDOW_INT16 1 -INIT_XMM ssse3 -APPLY_WINDOW_INT16 1 -INIT_XMM ssse3, atom -APPLY_WINDOW_INT16 1 - - ; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, ; const uint8_t *diff, int w, ; int *left, int *left_top) |