diff options
author | Michael Niedermayer <michaelni@gmx.at> | 2011-10-22 01:03:27 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2011-10-22 01:16:41 +0200 |
commit | aedc908601de7396751a9a4504e064782d9f6a0b (patch) | |
tree | 8f04b899142439893bac426ac83d05c4068b099c /libavcodec/x86/h264_weight.asm | |
parent | 1a7090bfafe986d4470ba8059c815939171ddb74 (diff) | |
parent | f4b51d061f0f34e36be876b562b8abe47f4b9c1c (diff) | |
download | ffmpeg-aedc908601de7396751a9a4504e064782d9f6a0b.tar.gz |
Merge remote-tracking branch 'qatar/master'
* qatar/master: (35 commits)
flvdec: Do not call parse_keyframes_index with a NULL stream
libspeexdec: include system headers before local headers
libspeexdec: return meaningful error codes
libspeexdec: cosmetics: reindent
libspeexdec: decode one frame at a time.
swscale: fix signed shift overflows in ff_yuv2rgb_c_init_tables()
Move timefilter code from lavf to lavd.
mov: add support for hdvd and pgapmetadata atoms
mov: rename function _stik, some indentation cosmetics
mov: rename function _int8 to remove ambiguity, some indentation cosmetics
mov: parse the gnre atom
mp3on4: check for allocation failures in decode_init_mp3on4()
mp3on4: create a separate flush function for MP3onMP4.
mp3on4: ensure that the frame channel count does not exceed the codec channel count.
mp3on4: set channel layout
mp3on4: fix the output channel order
mp3on4: allocate temp buffer with av_malloc() instead of on the stack.
mp3on4: copy MPADSPContext from first context to all contexts.
fmtconvert: port float_to_int16_interleave() 2-channel x86 inline asm to yasm
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm
...
Conflicts:
libavcodec/arm/h264dsp_init_arm.c
libavcodec/h264.c
libavcodec/h264.h
libavcodec/h264_cabac.c
libavcodec/h264_cavlc.c
libavcodec/h264_ps.c
libavcodec/h264dsp_template.c
libavcodec/h264idct_template.c
libavcodec/h264pred.c
libavcodec/h264pred_template.c
libavcodec/x86/h264dsp_mmx.c
libavdevice/Makefile
libavdevice/jack_audio.c
libavformat/Makefile
libavformat/flvdec.c
libavformat/flvenc.c
libavutil/pixfmt.h
libswscale/utils.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/h264_weight.asm')
-rw-r--r-- | libavcodec/x86/h264_weight.asm | 210 |
1 files changed, 79 insertions, 131 deletions
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index bb0af86097..cc96cb1f3b 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -28,21 +28,20 @@ SECTION .text ;----------------------------------------------------------------------------- ; biweight pred: ; -; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int log2_denom, int weightd, int weights, -; int offset); +; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int height, int log2_denom, int weightd, +; int weights, int offset); ; and -; void h264_weight_16x16_sse2(uint8_t *dst, int stride, -; int log2_denom, int weight, -; int offset); +; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, +; int log2_denom, int weight, int offset); ;----------------------------------------------------------------------------- %macro WEIGHT_SETUP 0 - add r4, r4 - inc r4 - movd m3, r3d - movd m5, r4d - movd m6, r2d + add r5, r5 + inc r5 + movd m3, r4d + movd m5, r5d + movd m6, r3d pslld m5, m6 psrld m5, 1 %if mmsize == 16 @@ -71,60 +70,41 @@ SECTION .text packuswb m0, m1 %endmacro -%macro WEIGHT_FUNC_DBL_MM 1 -cglobal h264_weight_16x%1_mmx2, 5, 5, 0 +INIT_MMX +cglobal h264_weight_16_mmx2, 6, 6, 0 WEIGHT_SETUP - mov r2, %1 -%if %1 == 16 .nextrow WEIGHT_OP 0, 4 mova [r0 ], m0 WEIGHT_OP 8, 12 mova [r0+8], m0 add r0, r1 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) -%endif -%endmacro -INIT_MMX -WEIGHT_FUNC_DBL_MM 16 -WEIGHT_FUNC_DBL_MM 8 - -%macro WEIGHT_FUNC_MM 4 -cglobal h264_weight_%1x%2_%4, 7, 7, %3 +%macro WEIGHT_FUNC_MM 3 +cglobal h264_weight_%1_%3, 6, 6, %2 WEIGHT_SETUP - mov r2, %2 -%if %2 == 16 .nextrow WEIGHT_OP 0, mmsize/2 mova [r0], m0 add r0, r1 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_%1x16_%4.nextrow) -%endif %endmacro INIT_MMX -WEIGHT_FUNC_MM 8, 16, 0, mmx2 -WEIGHT_FUNC_MM 8, 8, 0, mmx2 -WEIGHT_FUNC_MM 8, 4, 0, mmx2 +WEIGHT_FUNC_MM 8, 0, mmx2 INIT_XMM -WEIGHT_FUNC_MM 16, 16, 8, sse2 -WEIGHT_FUNC_MM 16, 8, 8, sse2 +WEIGHT_FUNC_MM 16, 8, sse2 -%macro WEIGHT_FUNC_HALF_MM 5 -cglobal h264_weight_%1x%2_%5, 5, 5, %4 +%macro WEIGHT_FUNC_HALF_MM 3 +cglobal h264_weight_%1_%3, 6, 6, %2 WEIGHT_SETUP - mov r2, %2/2 + sar r2d, 1 lea r3, [r1*2] -%if %2 == mmsize .nextrow WEIGHT_OP 0, r1 movh [r0], m0 @@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4 movh [r0+r1], m0 %endif add r0, r3 - dec r2 + dec r2d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) -%endif %endmacro INIT_MMX -WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 -WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 -WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 0, mmx2 INIT_XMM -WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 -WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 -WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m3, r4d - movd m4, r5d - movd m5, r6d - movd m6, r3d +%ifdef ARCH_X86_64 +%define off_regd r11d +%else +%define off_regd r3d +%endif + mov off_regd, r7m + add off_regd, 1 + or off_regd, 1 + add r4, 1 + movd m3, r5d + movd m4, r6d + movd m5, off_regd + movd m6, r4d pslld m5, m6 psrld m5, 1 %if mmsize == 16 @@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 packuswb m0, m1 %endmacro -%macro BIWEIGHT_FUNC_DBL_MM 1 -cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 +INIT_MMX +cglobal h264_biweight_16_mmx2, 7, 7, 0 BIWEIGHT_SETUP - mov r3, %1 -%if %1 == 16 + movifnidn r3d, r3m .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, 4 @@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 mova [r0+8], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) -%endif -%endmacro -INIT_MMX -BIWEIGHT_FUNC_DBL_MM 16 -BIWEIGHT_FUNC_DBL_MM 8 - -%macro BIWEIGHT_FUNC_MM 4 -cglobal h264_biweight_%1x%2_%4, 7, 7, %3 +%macro BIWEIGHT_FUNC_MM 3 +cglobal h264_biweight_%1_%3, 7, 7, %2 BIWEIGHT_SETUP - mov r3, %2 -%if %2 == 16 + movifnidn r3d, r3m .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, mmsize/2 @@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3 mova [r0], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) -%endif %endmacro INIT_MMX -BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 -BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 -BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 0, mmx2 INIT_XMM -BIWEIGHT_FUNC_MM 16, 16, 8, sse2 -BIWEIGHT_FUNC_MM 16, 8, 8, sse2 +BIWEIGHT_FUNC_MM 16, 8, sse2 -%macro BIWEIGHT_FUNC_HALF_MM 5 -cglobal h264_biweight_%1x%2_%5, 7, 7, %4 +%macro BIWEIGHT_FUNC_HALF_MM 3 +cglobal h264_biweight_%1_%3, 7, 7, %2 BIWEIGHT_SETUP - mov r3, %2/2 + movifnidn r3d, r3m + sar r3, 1 lea r4, [r2*2] -%if %2 == mmsize .nextrow BIWEIGHT_STEPA 0, 1, 0 BIWEIGHT_STEPA 1, 2, r2 @@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4 %endif add r0, r4 add r1, r4 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) -%endif %endmacro INIT_MMX -BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 -BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 -BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2 INIT_XMM -BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 -BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 -BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 %macro BIWEIGHT_SSSE3_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m4, r4d - movd m0, r5d - movd m5, r6d - movd m6, r3d +%ifdef ARCH_X86_64 +%define off_regd r11d +%else +%define off_regd r3d +%endif + mov off_regd, r7m + add off_regd, 1 + or off_regd, 1 + add r4, 1 + movd m4, r5d + movd m0, r6d + movd m5, off_regd + movd m6, r4d pslld m5, m6 psrld m5, 1 punpcklbw m4, m0 @@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 packuswb m0, m2 %endmacro -%macro BIWEIGHT_SSSE3_16 1 -cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 +INIT_XMM +cglobal h264_biweight_16_ssse3, 7, 7, 8 BIWEIGHT_SSSE3_SETUP - mov r3, %1 + movifnidn r3d, r3m -%if %1 == 16 .nextrow movh m0, [r0] movh m2, [r0+8] @@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 mova [r0], m0 add r0, r2 add r1, r2 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) -%endif -%endmacro INIT_XMM -BIWEIGHT_SSSE3_16 16 -BIWEIGHT_SSSE3_16 8 - -%macro BIWEIGHT_SSSE3_8 1 -cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 +cglobal h264_biweight_8_ssse3, 7, 7, 8 BIWEIGHT_SSSE3_SETUP - mov r3, %1/2 + movifnidn r3d, r3m + sar r3, 1 lea r4, [r2*2] -%if %1 == 16 .nextrow movh m0, [r0] movh m1, [r1] @@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 movhps [r0+r2], m0 add r0, r4 add r1, r4 - dec r3 + dec r3d jnz .nextrow REP_RET -%else - jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) -%endif -%endmacro - -INIT_XMM -BIWEIGHT_SSSE3_8 16 -BIWEIGHT_SSSE3_8 8 -BIWEIGHT_SSSE3_8 4 |