diff options
author | Diego Biurrun <diego@biurrun.de> | 2012-07-04 01:07:42 +0200 |
---|---|---|
committer | Diego Biurrun <diego@biurrun.de> | 2012-07-05 17:37:10 +0200 |
commit | d20f133ef962da71326bc3635e086696f45ab64e (patch) | |
tree | 795d25246c02b5ed21bc8d6bc1e096a223dfdcca /libavcodec/x86/h264_intrapred.asm | |
parent | 2696789c52cd4a8a2ff554d2329c60001459c21b (diff) | |
download | ffmpeg-d20f133ef962da71326bc3635e086696f45ab64e.tar.gz |
x86: h264_intrapred: port to cpuflag macros
Diffstat (limited to 'libavcodec/x86/h264_intrapred.asm')
-rw-r--r-- | libavcodec/x86/h264_intrapred.asm | 290 |
1 files changed, 156 insertions, 134 deletions
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index 3beb3b9d6d..87d32c4ee3 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -87,23 +87,23 @@ cglobal pred16x16_vertical_sse, 2,3 ; void pred16x16_horizontal(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED16x16_H 1 -cglobal pred16x16_horizontal_%1, 2,3 +%macro PRED16x16_H 0 +cglobal pred16x16_horizontal, 2,3 mov r2, 8 -%ifidn %1, ssse3 +%if cpuflag(ssse3) mova m2, [pb_3] %endif .loop: movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] -%ifidn %1, ssse3 +%if cpuflag(ssse3) pshufb m0, m2 pshufb m1, m2 %else punpcklbw m0, m0 punpcklbw m1, m1 -%ifidn %1, mmxext +%if cpuflag(mmx2) pshufw m0, m0, 0xff pshufw m1, m1, 0xff %else @@ -124,18 +124,20 @@ cglobal pred16x16_horizontal_%1, 2,3 REP_RET %endmacro -INIT_MMX -PRED16x16_H mmx -PRED16x16_H mmxext +INIT_MMX mmx +PRED16x16_H +INIT_MMX mmx2 +PRED16x16_H +INIT_XMM ssse3 +PRED16x16_H INIT_XMM -PRED16x16_H ssse3 ;----------------------------------------------------------------------------- ; void pred16x16_dc(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED16x16_DC 1 -cglobal pred16x16_dc_%1, 2,7 +%macro PRED16x16_DC 0 +cglobal pred16x16_dc, 2,7 mov r4, r0 sub r0, r1 pxor mm0, mm0 @@ -158,19 +160,19 @@ cglobal pred16x16_dc_%1, 2,7 add r5d, r6d lea r2d, [r2+r5+16] shr r2d, 5 -%ifidn %1, mmxext +%if cpuflag(ssse3) + pxor m1, m1 movd m0, r2d - punpcklbw m0, m0 - pshufw m0, m0, 0 -%elifidn %1, sse2 + pshufb m0, m1 +%elif cpuflag(sse2) movd m0, r2d punpcklbw m0, m0 pshuflw m0, m0, 0 punpcklqdq m0, m0 -%elifidn %1, ssse3 - pxor m1, m1 +%elif cpuflag(mmx2) movd m0, r2d - pshufb m0, m1 + punpcklbw m0, m0 + pshufw m0, m0, 0 %endif %if mmsize==8 @@ -195,18 +197,20 @@ cglobal pred16x16_dc_%1, 2,7 REP_RET %endmacro -INIT_MMX -PRED16x16_DC mmxext +INIT_MMX mmx2 +PRED16x16_DC +INIT_XMM sse2 +PRED16x16_DC +INIT_XMM ssse3 +PRED16x16_DC INIT_XMM -PRED16x16_DC sse2 -PRED16x16_DC ssse3 ;----------------------------------------------------------------------------- ; void pred16x16_tm_vp8(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED16x16_TM_MMX 1 -cglobal pred16x16_tm_vp8_%1, 2,5 +%macro PRED16x16_TM_MMX 0 +cglobal pred16x16_tm_vp8, 2,5 sub r0, r1 pxor mm7, mm7 movq mm0, [r0+0] @@ -223,11 +227,11 @@ cglobal pred16x16_tm_vp8_%1, 2,5 movzx r2d, byte [r0+r1-1] sub r2d, r3d movd mm4, r2d -%ifidn %1, mmx +%if cpuflag(mmx2) + pshufw mm4, mm4, 0 +%else punpcklwd mm4, mm4 punpckldq mm4, mm4 -%else - pshufw mm4, mm4, 0 %endif movq mm5, mm4 movq mm6, mm4 @@ -246,8 +250,11 @@ cglobal pred16x16_tm_vp8_%1, 2,5 REP_RET %endmacro -PRED16x16_TM_MMX mmx -PRED16x16_TM_MMX mmxext +INIT_MMX mmx +PRED16x16_TM_MMX +INIT_MMX mmx2 +PRED16x16_TM_MMX +INIT_MMX cglobal pred16x16_tm_vp8_sse2, 2,6,6 sub r0, r1 @@ -288,8 +295,8 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6 ; void pred16x16_plane(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H264_PRED16x16_PLANE 3 -cglobal pred16x16_plane_%3_%1, 2, 9, %2 +%macro H264_PRED16x16_PLANE 1 +cglobal pred16x16_plane_%1, 2,9,7 mov r2, r1 ; +stride neg r1 ; -stride @@ -310,7 +317,10 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 paddw m0, m2 paddw m1, m3 %else ; mmsize == 16 -%ifidn %1, sse2 +%if cpuflag(ssse3) + movhps m0, [r0+r1 +8] + pmaddubsw m0, [plane_shuf] ; H coefficients +%else ; sse2 pxor m2, m2 movh m1, [r0+r1 +8] punpcklbw m0, m2 @@ -318,29 +328,26 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 pmullw m0, [pw_m8tom1] pmullw m1, [pw_1to8] paddw m0, m1 -%else ; ssse3 - movhps m0, [r0+r1 +8] - pmaddubsw m0, [plane_shuf] ; H coefficients %endif movhlps m1, m0 %endif paddw m0, m1 -%ifidn %1, mmx +%if cpuflag(sse2) + pshuflw m1, m0, 0xE +%elif cpuflag(mmx2) + pshufw m1, m0, 0xE +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 32 -%elifidn %1, mmx2 - pshufw m1, m0, 0xE -%else ; mmsize == 16 - pshuflw m1, m0, 0xE %endif paddw m0, m1 -%ifidn %1, mmx +%if cpuflag(sse2) + pshuflw m1, m0, 0x1 +%elif cpuflag(mmx2) + pshufw m1, m0, 0x1 +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 16 -%elifidn %1, mmx2 - pshufw m1, m0, 0x1 -%else - pshuflw m1, m0, 0x1 %endif paddw m0, m1 ; sum of H coefficients @@ -424,13 +431,13 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 mov r0, r0m %endif -%ifidn %3, h264 +%ifidn %1, h264 lea r5, [r5*5+32] sar r5, 6 -%elifidn %3, rv40 +%elifidn %1, rv40 lea r5, [r5*5] sar r5, 6 -%elifidn %3, svq3 +%elifidn %1, svq3 test r5, r5 lea r6, [r5+3] cmovs r5, r6 @@ -449,8 +456,8 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 movd r1d, m0 movsx r1d, r1w -%ifnidn %3, svq3 -%ifidn %3, h264 +%ifnidn %1, svq3 +%ifidn %1, h264 lea r1d, [r1d*5+32] %else ; rv40 lea r1d, [r1d*5] @@ -476,26 +483,26 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 movd m1, r5d movd m3, r3d -%ifidn %1, mmx - punpcklwd m0, m0 - punpcklwd m1, m1 - punpcklwd m3, m3 - punpckldq m0, m0 - punpckldq m1, m1 - punpckldq m3, m3 -%elifidn %1, mmx2 - pshufw m0, m0, 0x0 - pshufw m1, m1, 0x0 - pshufw m3, m3, 0x0 -%else +%if cpuflag(sse2) pshuflw m0, m0, 0x0 pshuflw m1, m1, 0x0 pshuflw m3, m3, 0x0 punpcklqdq m0, m0 ; splat H (words) punpcklqdq m1, m1 ; splat V (words) punpcklqdq m3, m3 ; splat a (words) +%elif cpuflag(mmx2) + pshufw m0, m0, 0x0 + pshufw m1, m1, 0x0 + pshufw m3, m3, 0x0 +%elif cpuflag(mmx) + punpcklwd m0, m0 + punpcklwd m1, m1 + punpcklwd m3, m3 + punpckldq m0, m0 + punpckldq m1, m1 + punpckldq m3, m3 %endif -%ifidn %3, svq3 +%ifidn %1, svq3 SWAP 0, 1 %endif mova m2, m0 @@ -568,27 +575,30 @@ cglobal pred16x16_plane_%3_%1, 2, 9, %2 REP_RET %endmacro -INIT_MMX -H264_PRED16x16_PLANE mmx, 0, h264 -H264_PRED16x16_PLANE mmx, 0, rv40 -H264_PRED16x16_PLANE mmx, 0, svq3 -H264_PRED16x16_PLANE mmx2, 0, h264 -H264_PRED16x16_PLANE mmx2, 0, rv40 -H264_PRED16x16_PLANE mmx2, 0, svq3 +INIT_MMX mmx +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_MMX mmx2 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_XMM sse2 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 +INIT_XMM ssse3 +H264_PRED16x16_PLANE h264 +H264_PRED16x16_PLANE rv40 +H264_PRED16x16_PLANE svq3 INIT_XMM -H264_PRED16x16_PLANE sse2, 8, h264 -H264_PRED16x16_PLANE sse2, 8, rv40 -H264_PRED16x16_PLANE sse2, 8, svq3 -H264_PRED16x16_PLANE ssse3, 8, h264 -H264_PRED16x16_PLANE ssse3, 8, rv40 -H264_PRED16x16_PLANE ssse3, 8, svq3 ;----------------------------------------------------------------------------- ; void pred8x8_plane(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H264_PRED8x8_PLANE 2 -cglobal pred8x8_plane_%1, 2, 9, %2 +%macro H264_PRED8x8_PLANE 0 +cglobal pred8x8_plane, 2,9,7 mov r2, r1 ; +stride neg r1 ; -stride @@ -601,39 +611,39 @@ cglobal pred8x8_plane_%1, 2, 9, %2 pmullw m0, [pw_m4to4] pmullw m1, [pw_m4to4+8] %else ; mmsize == 16 -%ifidn %1, sse2 +%if cpuflag(ssse3) + movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary + pmaddubsw m0, [plane8_shuf] ; H coefficients +%else ; sse2 pxor m2, m2 movd m1, [r0+r1 +4] punpckldq m0, m1 punpcklbw m0, m2 pmullw m0, [pw_m4to4] -%else ; ssse3 - movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary - pmaddubsw m0, [plane8_shuf] ; H coefficients %endif movhlps m1, m0 %endif paddw m0, m1 -%ifnidn %1, ssse3 -%ifidn %1, mmx +%if notcpuflag(ssse3) +%if cpuflag(sse2) ; mmsize == 16 + pshuflw m1, m0, 0xE +%elif cpuflag(mmx2) + pshufw m1, m0, 0xE +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 32 -%elifidn %1, mmx2 - pshufw m1, m0, 0xE -%else ; mmsize == 16 - pshuflw m1, m0, 0xE %endif paddw m0, m1 %endif ; !ssse3 -%ifidn %1, mmx +%if cpuflag(sse2) + pshuflw m1, m0, 0x1 +%elif cpuflag(mmx2) + pshufw m1, m0, 0x1 +%elif cpuflag(mmx) mova m1, m0 psrlq m1, 16 -%elifidn %1, mmx2 - pshufw m1, m0, 0x1 -%else - pshuflw m1, m0, 0x1 %endif paddw m0, m1 ; sum of H coefficients @@ -701,24 +711,24 @@ cglobal pred8x8_plane_%1, 2, 9, %2 movd m1, r5d movd m3, r3d -%ifidn %1, mmx - punpcklwd m0, m0 - punpcklwd m1, m1 - punpcklwd m3, m3 - punpckldq m0, m0 - punpckldq m1, m1 - punpckldq m3, m3 -%elifidn %1, mmx2 - pshufw m0, m0, 0x0 - pshufw m1, m1, 0x0 - pshufw m3, m3, 0x0 -%else +%if cpuflag(sse2) pshuflw m0, m0, 0x0 pshuflw m1, m1, 0x0 pshuflw m3, m3, 0x0 punpcklqdq m0, m0 ; splat H (words) punpcklqdq m1, m1 ; splat V (words) punpcklqdq m3, m3 ; splat a (words) +%elif cpuflag(mmx2) + pshufw m0, m0, 0x0 + pshufw m1, m1, 0x0 + pshufw m3, m3, 0x0 +%elif cpuflag(mmx) + punpcklwd m0, m0 + punpcklwd m1, m1 + punpcklwd m3, m3 + punpckldq m0, m0 + punpckldq m1, m1 + punpckldq m3, m3 %endif %if mmsize == 8 mova m2, m0 @@ -768,12 +778,15 @@ ALIGN 16 REP_RET %endmacro -INIT_MMX -H264_PRED8x8_PLANE mmx, 0 -H264_PRED8x8_PLANE mmx2, 0 +INIT_MMX mmx +H264_PRED8x8_PLANE +INIT_MMX mmx2 +H264_PRED8x8_PLANE +INIT_XMM sse2 +H264_PRED8x8_PLANE +INIT_XMM ssse3 +H264_PRED8x8_PLANE INIT_XMM -H264_PRED8x8_PLANE sse2, 8 -H264_PRED8x8_PLANE ssse3, 8 ;----------------------------------------------------------------------------- ; void pred8x8_vertical(uint8_t *src, int stride) @@ -795,22 +808,22 @@ cglobal pred8x8_vertical_mmx, 2,2 ; void pred8x8_horizontal(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_H 1 -cglobal pred8x8_horizontal_%1, 2,3 +%macro PRED8x8_H 0 +cglobal pred8x8_horizontal, 2,3 mov r2, 4 -%ifidn %1, ssse3 +%if cpuflag(ssse3) mova m2, [pb_3] %endif .loop: movd m0, [r0+r1*0-4] movd m1, [r0+r1*1-4] -%ifidn %1, ssse3 +%if cpuflag(ssse3) pshufb m0, m2 pshufb m1, m2 %else punpcklbw m0, m0 punpcklbw m1, m1 -%ifidn %1, mmxext +%if cpuflag(mmx2) pshufw m0, m0, 0xff pshufw m1, m1, 0xff %else @@ -828,10 +841,13 @@ cglobal pred8x8_horizontal_%1, 2,3 REP_RET %endmacro +INIT_MMX mmx +PRED8x8_H +INIT_MMX mmx2 +PRED8x8_H +INIT_MMX ssse3 +PRED8x8_H INIT_MMX -PRED8x8_H mmx -PRED8x8_H mmxext -PRED8x8_H ssse3 ;----------------------------------------------------------------------------- ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride) @@ -967,8 +983,8 @@ cglobal pred8x8_dc_rv40_mmxext, 2,7 ; void pred8x8_tm_vp8(uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro PRED8x8_TM_MMX 1 -cglobal pred8x8_tm_vp8_%1, 2,6 +%macro PRED8x8_TM_MMX 0 +cglobal pred8x8_tm_vp8, 2,6 sub r0, r1 pxor mm7, mm7 movq mm0, [r0] @@ -984,14 +1000,14 @@ cglobal pred8x8_tm_vp8_%1, 2,6 sub r3d, r4d movd mm2, r2d movd mm4, r3d -%ifidn %1, mmx +%if cpuflag(mmx2) + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else punpcklwd mm2, mm2 punpcklwd mm4, mm4 punpckldq mm2, mm2 punpckldq mm4, mm4 -%else - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 %endif movq mm3, mm2 movq mm5, mm4 @@ -1009,8 +1025,11 @@ cglobal pred8x8_tm_vp8_%1, 2,6 REP_RET %endmacro -PRED8x8_TM_MMX mmx -PRED8x8_TM_MMX mmxext +INIT_MMX mmx +PRED8x8_TM_MMX +INIT_MMX mmx2 +PRED8x8_TM_MMX +INIT_MMX cglobal pred8x8_tm_vp8_sse2, 2,6,4 sub r0, r1 @@ -2510,8 +2529,8 @@ cglobal pred4x4_dc_mmxext, 3,5 ; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride) ;----------------------------------------------------------------------------- -%macro PRED4x4_TM_MMX 1 -cglobal pred4x4_tm_vp8_%1, 3,6 +%macro PRED4x4_TM_MMX 0 +cglobal pred4x4_tm_vp8, 3,6 sub r0, r2 pxor mm7, mm7 movd mm0, [r0] @@ -2525,14 +2544,14 @@ cglobal pred4x4_tm_vp8_%1, 3,6 sub r3d, r4d movd mm2, r1d movd mm4, r3d -%ifidn %1, mmx +%if cpuflag(mmx2) + pshufw mm2, mm2, 0 + pshufw mm4, mm4, 0 +%else punpcklwd mm2, mm2 punpcklwd mm4, mm4 punpckldq mm2, mm2 punpckldq mm4, mm4 -%else - pshufw mm2, mm2, 0 - pshufw mm4, mm4, 0 %endif paddw mm2, mm0 paddw mm4, mm0 @@ -2546,8 +2565,11 @@ cglobal pred4x4_tm_vp8_%1, 3,6 REP_RET %endmacro -PRED4x4_TM_MMX mmx -PRED4x4_TM_MMX mmxext +INIT_MMX mmx +PRED4x4_TM_MMX +INIT_MMX mmx2 +PRED4x4_TM_MMX +INIT_MMX cglobal pred4x4_tm_vp8_ssse3, 3,3 sub r0, r2 |