commit     2af8f2cea6c94eba3a15820194cb7374b366976a
tree       634d34b8adf1c35cc1bb7c3eb1f2b49775ffbb56 /libavcodec/x86/vp8dsp.asm
parent     33a183df46355e4b281517e14c9b3c7e2b558dcf
parent     3faa141d15bf9945fa54331e51b3f10b9970d5d2
author     Michael Niedermayer <michaelni@gmx.at>  2012-03-05 00:02:58 +0100
committer  Michael Niedermayer <michaelni@gmx.at>  2012-03-05 00:15:55 +0100
Merge remote-tracking branch 'qatar/master'
* qatar/master: (27 commits)
cmdutils: use new avcodec_is_decoder/encoder() functions.
lavc: make codec_is_decoder/encoder() public.
lavc: deprecate AVCodecContext.sub_id.
libcdio: add a forgotten AVClass to the private context.
swscale: remove "cpu flags" from -sws_flags description.
proresenc: give user a possibility to alter some encoding parameters
vorbisenc: add output buffer overwrite protection
libopencore-amrnbenc: fix end-of-stream handling
ra144enc: fix end-of-stream handling
nellymoserenc: zero any leftover packet bytes
nellymoserenc: use proper MDCT overlap delay
qpeg: Use bytestream2 functions to prevent buffer overreads.
swscale: make %rep unconditional.
vp8: convert simple loopfilter x86 assembly to use named arguments.
vp8: convert idct x86 assembly to use named arguments.
vp8: convert mc x86 assembly to use named arguments.
vp8: convert loopfilter x86 assembly to use cpuflags().
vp8: convert idct/mc x86 assembly to use cpuflags().
swscale: remove now unnecessary hack.
x86inc: don't "bake" stack_offset in named arguments.
...
Conflicts:
cmdutils.c
doc/APIchanges
libavcodec/mpeg12.c
libavcodec/options.c
libavcodec/qpeg.c
libavcodec/utils.c
libavcodec/version.h
libavdevice/libcdio.c
tests/lavf-regression.sh
Merged-by: Michael Niedermayer <michaelni@gmx.at>
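
Several of the commits merged above ("convert ... x86 assembly to use named arguments", "... to use cpuflags()") move vp8dsp.asm to the newer x86inc conventions: INIT_MMX/INIT_XMM now take the instruction set as an argument, cglobal derives the _mmx/_sse2/... name suffix from it, and the trailing names on the cglobal line alias each argument register. A minimal sketch of the convention follows — copy_rows is an invented example, not code from this patch, and it assumes x86inc.asm is included:

INIT_XMM sse2
; 5 arguments, 5 GPRs, 1 XMM register; emits copy_rows_sse2
cglobal copy_rows, 5, 5, 1, dst, dststride, src, srcstride, height
.nextrow:
    movu       m0, [srcq]       ; srcq aliases the 3rd argument register
    movu   [dstq], m0           ; dstq aliases the 1st
    add      srcq, srcstrideq
    add      dstq, dststrideq
    dec   heightd               ; heightd = 32-bit view of the 5th argument
    jg .nextrow
    RET

The q/d/w suffixes select the 64/32/16-bit view of a named register, which is what lets the same source assemble for both x86-32 and x86-64 without hard-coding r0/r2/r4d.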
Diffstat (limited to 'libavcodec/x86/vp8dsp.asm')
-rw-r--r--  libavcodec/x86/vp8dsp.asm  1246
1 file changed, 611 insertions, 635 deletions
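
The first hunk below replaces the hard-coded r11 PIC pointer with a named picreg argument that is only allocated when a PIC build actually needs it (npicregs). A hypothetical stand-alone sketch of that pattern — my_table, my_table_m and load_row are invented names, and x86inc.asm is assumed:

%ifdef PIC
%define my_table picregq    ; under PIC, reach the table through a register
%define npicregs 1          ; ... so reserve one extra GPR for it
%else
%define my_table my_table_m ; otherwise address the data symbol directly
%define npicregs 0
%endif

cglobal load_row, 2, 2 + npicregs, 1, dst, mx, picreg
%ifdef PIC
    lea   picregq, [my_table_m]   ; materialize the table address once
%endif
    movu       m0, [my_table+mxq] ; the same expression works in both modes
    movu   [dstq], m0
    RET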
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 140597031f..05a2a5712b 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -116,23 +116,25 @@ bilinear_filter_vb_m: times 8 db 7, 1
                       times 8 db 1, 7
 
 %ifdef PIC
-%define fourtap_filter_hw   r11
-%define sixtap_filter_hw    r11
-%define fourtap_filter_hb   r11
-%define sixtap_filter_hb    r11
-%define fourtap_filter_v    r11
-%define sixtap_filter_v     r11
-%define bilinear_filter_vw  r11
-%define bilinear_filter_vb  r11
+%define fourtap_filter_hw  picregq
+%define sixtap_filter_hw   picregq
+%define fourtap_filter_hb  picregq
+%define sixtap_filter_hb   picregq
+%define fourtap_filter_v   picregq
+%define sixtap_filter_v    picregq
+%define bilinear_filter_vw picregq
+%define bilinear_filter_vb picregq
+%define npicregs 1
 %else
-%define fourtap_filter_hw  fourtap_filter_hw_m
-%define sixtap_filter_hw   sixtap_filter_hw_m
-%define fourtap_filter_hb  fourtap_filter_hb_m
-%define sixtap_filter_hb   sixtap_filter_hb_m
-%define fourtap_filter_v   fourtap_filter_v_m
-%define sixtap_filter_v    sixtap_filter_v_m
+%define fourtap_filter_hw fourtap_filter_hw_m
+%define sixtap_filter_hw  sixtap_filter_hw_m
+%define fourtap_filter_hb fourtap_filter_hb_m
+%define sixtap_filter_hb  sixtap_filter_hb_m
+%define fourtap_filter_v  fourtap_filter_v_m
+%define sixtap_filter_v   sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
 %define bilinear_filter_vb bilinear_filter_vb_m
+%define npicregs 0
 %endif
 
 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
@@ -173,26 +175,26 @@ SECTION .text
 ;                                            int height, int mx, int my);
 ;-----------------------------------------------------------------------------
 
-%macro FILTER_SSSE3 3
-cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
-    lea      r5d, [r5*3]
+%macro FILTER_SSSE3 1
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+    lea      mxd, [mxq*3]
     mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
 %ifdef PIC
-    lea      r11, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_hb_m]
 %endif
-    mova      m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
-    mova      m6, [sixtap_filter_hb+r5*8-32]
-    mova      m7, [sixtap_filter_hb+r5*8-16]
+    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
+    mova      m6, [sixtap_filter_hb+mxq*8-32]
+    mova      m7, [sixtap_filter_hb+mxq*8-16]
 
 .nextrow
-    movu      m0, [r2-2]
+    movu      m0, [srcq-2]
     mova      m1, m0
     mova      m2, m0
-%ifidn %1, 4
+%if mmsize == 8
 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
 ; shuffle with a memory operand
-    punpcklbw m0, [r2+3]
+    punpcklbw m0, [srcq+3]
 %else
     pshufb    m0, [filter_h6_shuf1]
 %endif
@@ -206,28 +208,28 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
     paddsw    m0, [pw_64]
     psraw     m0, 7
     packuswb  m0, m0
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
-    shl      r5d, 4
+cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 4
     mova      m2, [pw_64]
     mova      m3, [filter_h2_shuf]
     mova      m4, [filter_h4_shuf]
 %ifdef PIC
-    lea      r11, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_hb_m]
 %endif
-    mova      m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
-    mova      m6, [fourtap_filter_hb+r5]
+    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
+    mova      m6, [fourtap_filter_hb+mxq]
 
 .nextrow
-    movu      m0, [r2-1]
+    movu      m0, [srcq-1]
     mova      m1, m0
     pshufb    m0, m3
     pshufb    m1, m4
@@ -237,33 +239,33 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
     paddsw    m0, m1
     psraw     m0, 7
     packuswb  m0, m0
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
-    shl      r6d, 4
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
 %ifdef PIC
-    lea      r11, [fourtap_filter_hb_m]
+    lea  picregq, [fourtap_filter_hb_m]
 %endif
-    mova      m5, [fourtap_filter_hb+r6-16]
-    mova      m6, [fourtap_filter_hb+r6]
+    mova      m5, [fourtap_filter_hb+myq-16]
+    mova      m6, [fourtap_filter_hb+myq]
     mova      m7, [pw_64]
 
     ; read 3 lines
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+  r3]
-    movh      m2, [r2+2*r3]
-    add       r2, r3
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+  srcstrideq]
+    movh      m2, [srcq+2*srcstrideq]
+    add     srcq, srcstrideq
 
 .nextrow
-    movh      m3, [r2+2*r3]           ; read new row
+    movh      m3, [srcq+2*srcstrideq] ; read new row
     mova      m4, m0
     mova      m0, m1
     punpcklbw m4, m1
@@ -276,44 +278,44 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
     paddsw    m4, m7
     psraw     m4, 7
     packuswb  m4, m4
-    movh    [r0], m4
+    movh  [dstq], m4
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
-    lea      r6d, [r6*3]
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    lea      myd, [myq*3]
 %ifdef PIC
-    lea      r11, [sixtap_filter_hb_m]
+    lea  picregq, [sixtap_filter_hb_m]
 %endif
-    lea       r6, [sixtap_filter_hb+r6*8]
+    lea      myq, [sixtap_filter_hb+myq*8]
 
     ; read 5 lines
-    sub       r2, r3
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+r3]
-    movh      m2, [r2+r3*2]
-    lea       r2, [r2+r3*2]
-    add       r2, r3
-    movh      m3, [r2]
-    movh      m4, [r2+r3]
+    sub     srcq, srcstrideq
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+srcstrideq]
+    movh      m2, [srcq+srcstrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    add     srcq, srcstrideq
+    movh      m3, [srcq]
+    movh      m4, [srcq+srcstrideq]
 
 .nextrow
-    movh      m5, [r2+2*r3]           ; read new row
+    movh      m5, [srcq+2*srcstrideq] ; read new row
     mova      m6, m0
     punpcklbw m6, m5
     mova      m0, m1
     punpcklbw m1, m2
     mova      m7, m3
     punpcklbw m7, m4
-    pmaddubsw m6, [r6-48]
-    pmaddubsw m1, [r6-32]
-    pmaddubsw m7, [r6-16]
+    pmaddubsw m6, [myq-48]
+    pmaddubsw m1, [myq-32]
+    pmaddubsw m7, [myq-16]
     paddsw    m6, m1
     paddsw    m6, m7
     mova      m1, m2
@@ -323,34 +325,35 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
     mova      m3, m4
     packuswb  m6, m6
     mova      m4, m5
-    movh    [r0], m6
+    movh  [dstq], m6
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 %endmacro
 
-INIT_MMX
-FILTER_SSSE3 4, 0, 0
-INIT_XMM
-FILTER_SSSE3 8, 8, 7
+INIT_MMX ssse3
+FILTER_SSSE3 4
+INIT_XMM ssse3
+FILTER_SSSE3 8
 
 ; 4x4 block, H-only 4-tap filter
-cglobal put_vp8_epel4_h4_mmxext, 6, 6
-    shl       r5d, 4
+INIT_MMX mmx2
+cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+    shl       mxd, 4
 %ifdef PIC
-    lea       r11, [fourtap_filter_hw_m]
+    lea   picregq, [fourtap_filter_hw_m]
 %endif
-    movq      mm4, [fourtap_filter_hw+r5-16]  ; set up 4tap filter in words
-    movq      mm5, [fourtap_filter_hw+r5]
+    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
+    movq      mm5, [fourtap_filter_hw+mxq]
     movq      mm7, [pw_64]
     pxor      mm6, mm6
 
 .nextrow
-    movq      mm1, [r2-1]    ; (ABCDEFGH) load 8 horizontal pixels
+    movq      mm1, [srcq-1]  ; (ABCDEFGH) load 8 horizontal pixels
 
     ; first set of 2 pixels
     movq      mm2, mm1       ; byte ABCD..
@@ -376,29 +379,30 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
     paddsw    mm3, mm7       ; rounding
     psraw     mm3, 7
     packuswb  mm3, mm6       ; clip and word->bytes
-    movd     [r0], mm3       ; store
+    movd   [dstq], mm3       ; store
 
     ; go to next line
-    add        r0, r1
-    add        r2, r3
-    dec        r4d           ; next row
+    add      dstq, dststrideq
+    add      srcq, srcstrideq
+    dec   heightd            ; next row
    jg .nextrow
    REP_RET
 
 ; 4x4 block, H-only 6-tap filter
-cglobal put_vp8_epel4_h6_mmxext, 6, 6
-    lea       r5d, [r5*3]
+INIT_MMX mmx2
+cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
+    lea       mxd, [mxq*3]
 %ifdef PIC
-    lea       r11, [sixtap_filter_hw_m]
+    lea   picregq, [sixtap_filter_hw_m]
 %endif
-    movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words
-    movq      mm5, [sixtap_filter_hw+r5*8-32]
-    movq      mm6, [sixtap_filter_hw+r5*8-16]
+    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
+    movq      mm5, [sixtap_filter_hw+mxq*8-32]
+    movq      mm6, [sixtap_filter_hw+mxq*8-16]
     movq      mm7, [pw_64]
     pxor      mm3, mm3
 
 .nextrow
-    movq      mm1, [r2-2]    ; (ABCDEFGH) load 8 horizontal pixels
+    movq      mm1, [srcq-2]  ; (ABCDEFGH) load 8 horizontal pixels
 
     ; first set of 2 pixels
     movq      mm2, mm1       ; byte ABCD..
@@ -418,7 +422,7 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
     paddd     mm1, mm2       ; finish 1st 2px
 
     ; second set of 2 pixels, use backup of above
-    movd      mm2, [r2+3]    ; byte FGHI (prevent overreads)
+    movd      mm2, [srcq+3]  ; byte FGHI (prevent overreads)
     pmaddwd   mm0, mm4       ; multiply 1st backed up 2px with F0/F1
     pmaddwd   mm3, mm5       ; multiply 2nd backed up 2px with F2/F3
     paddd     mm0, mm3       ; add to 2nd 2px cache
@@ -433,35 +437,35 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
     paddsw    mm1, mm7       ; rounding
     psraw     mm1, 7
     packuswb  mm1, mm3       ; clip and word->bytes
-    movd     [r0], mm1       ; store
+    movd   [dstq], mm1       ; store
 
     ; go to next line
-    add        r0, r1
-    add        r2, r3
-    dec        r4d           ; next row
+    add      dstq, dststrideq
+    add      srcq, srcstrideq
+    dec   heightd            ; next row
     jg .nextrow
     REP_RET
 
-INIT_XMM
-cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
-    shl      r5d, 5
+INIT_XMM sse2
+cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 5
 %ifdef PIC
-    lea      r11, [fourtap_filter_v_m]
+    lea  picregq, [fourtap_filter_v_m]
 %endif
-    lea       r5, [fourtap_filter_v+r5-32]
+    lea      mxq, [fourtap_filter_v+mxq-32]
     pxor      m7, m7
     mova      m4, [pw_64]
-    mova      m5, [r5+ 0]
-    mova      m6, [r5+16]
+    mova      m5, [mxq+ 0]
+    mova      m6, [mxq+16]
 %ifdef m8
-    mova      m8, [r5+32]
-    mova      m9, [r5+48]
+    mova      m8, [mxq+32]
+    mova      m9, [mxq+48]
 %endif
 
 .nextrow
-    movq      m0, [r2-1]
-    movq      m1, [r2-0]
-    movq      m2, [r2+1]
-    movq      m3, [r2+2]
+    movq      m0, [srcq-1]
+    movq      m1, [srcq-0]
+    movq      m2, [srcq+1]
+    movq      m3, [srcq+2]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -472,8 +476,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
     pmullw    m2, m8
     pmullw    m3, m9
 %else
-    pmullw    m2, [r5+32]
-    pmullw    m3, [r5+48]
+    pmullw    m2, [mxq+32]
+    pmullw    m3, [mxq+48]
 %endif
     paddsw    m0, m1
     paddsw    m2, m3
@@ -481,39 +485,40 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
     paddsw    m0, m4
     psraw     m0, 7
     packuswb  m0, m7
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
-    lea      r5d, [r5*3]
-    shl      r5d, 4
+INIT_XMM sse2
+cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
+    lea      mxd, [mxq*3]
+    shl      mxd, 4
 %ifdef PIC
-    lea      r11, [sixtap_filter_v_m]
+    lea  picregq, [sixtap_filter_v_m]
 %endif
-    lea       r5, [sixtap_filter_v+r5-96]
+    lea      mxq, [sixtap_filter_v+mxq-96]
     pxor      m7, m7
     mova      m6, [pw_64]
 %ifdef m8
-    mova      m8, [r5+ 0]
-    mova      m9, [r5+16]
-    mova     m10, [r5+32]
-    mova     m11, [r5+48]
-    mova     m12, [r5+64]
-    mova     m13, [r5+80]
+    mova      m8, [mxq+ 0]
+    mova      m9, [mxq+16]
+    mova     m10, [mxq+32]
+    mova     m11, [mxq+48]
+    mova     m12, [mxq+64]
+    mova     m13, [mxq+80]
 %endif
 
 .nextrow
-    movq      m0, [r2-2]
-    movq      m1, [r2-1]
-    movq      m2, [r2-0]
-    movq      m3, [r2+1]
-    movq      m4, [r2+2]
-    movq      m5, [r2+3]
+    movq      m0, [srcq-2]
+    movq      m1, [srcq-1]
+    movq      m2, [srcq-0]
+    movq      m3, [srcq+1]
+    movq      m4, [srcq+2]
+    movq      m5, [srcq+3]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -528,12 +533,12 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
     pmullw    m4, m12
     pmullw    m5, m13
 %else
-    pmullw    m0, [r5+ 0]
-    pmullw    m1, [r5+16]
-    pmullw    m2, [r5+32]
-    pmullw    m3, [r5+48]
-    pmullw    m4, [r5+64]
-    pmullw    m5, [r5+80]
+    pmullw    m0, [mxq+ 0]
+    pmullw    m1, [mxq+16]
+    pmullw    m2, [mxq+32]
+    pmullw    m3, [mxq+48]
+    pmullw    m4, [mxq+64]
+    pmullw    m5, [mxq+80]
 %endif
     paddsw    m1, m4
     paddsw    m0, m5
@@ -543,52 +548,52 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
     paddsw    m0, m6
     psraw     m0, 7
     packuswb  m0, m7
-    movh    [r0], m0        ; store
+    movh  [dstq], m0        ; store
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
-%macro FILTER_V 3
+%macro FILTER_V 1
 ; 4x4 block, V-only 4-tap filter
-cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
-    shl      r6d, 5
+cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 5
 %ifdef PIC
-    lea      r11, [fourtap_filter_v_m]
+    lea  picregq, [fourtap_filter_v_m]
 %endif
-    lea       r6, [fourtap_filter_v+r6-32]
+    lea      myq, [fourtap_filter_v+myq-32]
     mova      m6, [pw_64]
     pxor      m7, m7
-    mova      m5, [r6+48]
+    mova      m5, [myq+48]
 
     ; read 3 lines
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+ r3]
-    movh      m2, [r2+2*r3]
-    add       r2, r3
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+ srcstrideq]
+    movh      m2, [srcq+2*srcstrideq]
+    add     srcq, srcstrideq
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
 
 .nextrow
     ; first calculate negative taps (to prevent losing positive overflows)
-    movh      m4, [r2+2*r3]           ; read new row
+    movh      m4, [srcq+2*srcstrideq] ; read new row
     punpcklbw m4, m7
     mova      m3, m4
-    pmullw    m0, [r6+0]
+    pmullw    m0, [myq+0]
     pmullw    m4, m5
     paddsw    m4, m0
 
     ; then calculate positive taps
     mova      m0, m1
-    pmullw    m1, [r6+16]
+    pmullw    m1, [myq+16]
     paddsw    m4, m1
     mova      m1, m2
-    pmullw    m2, [r6+32]
+    pmullw    m2, [myq+32]
     paddsw    m4, m2
     mova      m2, m3
@@ -596,36 +601,36 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
     paddsw    m4, m6
     psraw     m4, 7
     packuswb  m4, m7
-    movh    [r0], m4
+    movh  [dstq], m4
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 
 ; 4x4 block, V-only 6-tap filter
-cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
-    shl      r6d, 4
-    lea       r6, [r6*3]
+cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
+    lea      myq, [myq*3]
 %ifdef PIC
-    lea      r11, [sixtap_filter_v_m]
+    lea  picregq, [sixtap_filter_v_m]
 %endif
-    lea       r6, [sixtap_filter_v+r6-96]
+    lea      myq, [sixtap_filter_v+myq-96]
     pxor      m7, m7
 
     ; read 5 lines
-    sub       r2, r3
-    sub       r2, r3
-    movh      m0, [r2]
-    movh      m1, [r2+r3]
-    movh      m2, [r2+r3*2]
-    lea       r2, [r2+r3*2]
-    add       r2, r3
-    movh      m3, [r2]
-    movh      m4, [r2+r3]
+    sub     srcq, srcstrideq
+    sub     srcq, srcstrideq
+    movh      m0, [srcq]
+    movh      m1, [srcq+srcstrideq]
+    movh      m2, [srcq+srcstrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    add     srcq, srcstrideq
+    movh      m3, [srcq]
+    movh      m4, [srcq+srcstrideq]
     punpcklbw m0, m7
     punpcklbw m1, m7
     punpcklbw m2, m7
@@ -635,62 +640,61 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
 .nextrow
     ; first calculate negative taps (to prevent losing positive overflows)
     mova      m5, m1
-    pmullw    m5, [r6+16]
+    pmullw    m5, [myq+16]
     mova      m6, m4
-    pmullw    m6, [r6+64]
+    pmullw    m6, [myq+64]
     paddsw    m6, m5
 
     ; then calculate positive taps
-    movh      m5, [r2+2*r3]           ; read new row
+    movh      m5, [srcq+2*srcstrideq] ; read new row
     punpcklbw m5, m7
-    pmullw    m0, [r6+0]
+    pmullw    m0, [myq+0]
     paddsw    m6, m0
     mova      m0, m1
     mova      m1, m2
-    pmullw    m2, [r6+32]
+    pmullw    m2, [myq+32]
     paddsw    m6, m2
     mova      m2, m3
-    pmullw    m3, [r6+48]
+    pmullw    m3, [myq+48]
     paddsw    m6, m3
     mova      m3, m4
     mova      m4, m5
-    pmullw    m5, [r6+80]
+    pmullw    m5, [myq+80]
     paddsw    m6, m5
 
     ; round/clip/store
     paddsw    m6, [pw_64]
     psraw     m6, 7
     packuswb  m6, m7
-    movh    [r0], m6
+    movh  [dstq], m6
 
     ; go to next line
-    add       r0, r1
-    add       r2, r3
-    dec       r4d           ; next row
+    add     dstq, dststrideq
+    add     srcq, srcstrideq
+    dec  heightd            ; next row
     jg .nextrow
     REP_RET
 %endmacro
 
-INIT_MMX
-FILTER_V mmxext, 4, 0
-INIT_XMM
-FILTER_V sse2, 8, 8
+INIT_MMX mmx2
+FILTER_V 4
+INIT_XMM sse2
+FILTER_V 8
 
-%macro FILTER_BILINEAR 3
-cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
-    mov      r5d, 8*16
-    shl      r6d, 4
-    sub      r5d, r6d
+%macro FILTER_BILINEAR 1
+cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vw_m]
+    lea  picregq, [bilinear_filter_vw_m]
 %endif
     pxor      m6, m6
-    mova      m4, [bilinear_filter_vw+r5-16]
-    mova      m5, [bilinear_filter_vw+r6-16]
+    mova      m5, [bilinear_filter_vw+myq-1*16]
+    neg      myq
+    mova      m4, [bilinear_filter_vw+myq+7*16]
 .nextrow
-    movh      m0, [r2+r3*0]
-    movh      m1, [r2+r3*1]
-    movh      m3, [r2+r3*2]
+    movh      m0, [srcq+srcstrideq*0]
+    movh      m1, [srcq+srcstrideq*1]
+    movh      m3, [srcq+srcstrideq*2]
     punpcklbw m0, m6
     punpcklbw m1, m6
     punpcklbw m3, m6
@@ -705,38 +709,37 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
     psraw     m2, 2
     pavgw     m0, m6
     pavgw     m2, m6
-%ifidn %1, mmxext
+%if mmsize == 8
     packuswb  m0, m0
     packuswb  m2, m2
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m2
+    movh [dstq+dststrideq*0], m0
+    movh [dstq+dststrideq*1], m2
 %else
     packuswb  m0, m2
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
-    mov      r6d, 8*16
-    shl      r5d, 4
-    sub      r6d, r5d
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vw_m]
+    lea  picregq, [bilinear_filter_vw_m]
 %endif
     pxor      m6, m6
-    mova      m4, [bilinear_filter_vw+r6-16]
-    mova      m5, [bilinear_filter_vw+r5-16]
+    mova      m5, [bilinear_filter_vw+mxq-1*16]
+    neg      mxq
+    mova      m4, [bilinear_filter_vw+mxq+7*16]
 .nextrow
-    movh      m0, [r2+r3*0+0]
-    movh      m1, [r2+r3*0+1]
-    movh      m2, [r2+r3*1+0]
-    movh      m3, [r2+r3*1+1]
+    movh      m0, [srcq+srcstrideq*0+0]
+    movh      m1, [srcq+srcstrideq*0+1]
+    movh      m2, [srcq+srcstrideq*1+0]
+    movh      m3, [srcq+srcstrideq*1+1]
     punpcklbw m0, m6
     punpcklbw m1, m6
     punpcklbw m2, m6
@@ -751,41 +754,41 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
     psraw     m2, 2
     pavgw     m0, m6
     pavgw     m2, m6
-%ifidn %1, mmxext
+%if mmsize == 8
     packuswb  m0, m0
     packuswb  m2, m2
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m2
+    movh [dstq+dststrideq*0], m0
+    movh [dstq+dststrideq*1], m2
 %else
     packuswb  m0, m2
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 %endmacro
 
-INIT_MMX
-FILTER_BILINEAR mmxext, 4, 0
-INIT_XMM
-FILTER_BILINEAR sse2, 8, 7
+INIT_MMX mmx2
+FILTER_BILINEAR 4
+INIT_XMM sse2
+FILTER_BILINEAR 8
 
 %macro FILTER_BILINEAR_SSSE3 1
-cglobal put_vp8_bilinear%1_v_ssse3, 7,7
-    shl      r6d, 4
+cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
+    shl      myd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vb_m]
+    lea  picregq, [bilinear_filter_vb_m]
 %endif
     pxor      m4, m4
-    mova      m3, [bilinear_filter_vb+r6-16]
+    mova      m3, [bilinear_filter_vb+myq-16]
 .nextrow
-    movh      m0, [r2+r3*0]
-    movh      m1, [r2+r3*1]
-    movh      m2, [r2+r3*2]
+    movh      m0, [srcq+srcstrideq*0]
+    movh      m1, [srcq+srcstrideq*1]
+    movh      m2, [srcq+srcstrideq*2]
     punpcklbw m0, m1
     punpcklbw m1, m2
     pmaddubsw m0, m3
@@ -797,31 +800,31 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
 %if mmsize==8
     packuswb  m0, m0
     packuswb  m1, m1
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m1
+    movh [dstq+dststrideq*0], m0
+    movh [dstq+dststrideq*1], m1
 %else
     packuswb  m0, m1
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 
-cglobal put_vp8_bilinear%1_h_ssse3, 7,7
-    shl      r5d, 4
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
+    shl      mxd, 4
 %ifdef PIC
-    lea      r11, [bilinear_filter_vb_m]
+    lea  picregq, [bilinear_filter_vb_m]
 %endif
     pxor      m4, m4
     mova      m2, [filter_h2_shuf]
-    mova      m3, [bilinear_filter_vb+r5-16]
+    mova      m3, [bilinear_filter_vb+mxq-16]
 .nextrow
-    movu      m0, [r2+r3*0]
-    movu      m1, [r2+r3*1]
+    movu      m0, [srcq+srcstrideq*0]
+    movu      m1, [srcq+srcstrideq*1]
     pshufb    m0, m2
     pshufb    m1, m2
     pmaddubsw m0, m3
@@ -833,65 +836,68 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
 %if mmsize==8
     packuswb  m0, m0
     packuswb  m1, m1
-    movh [r0+r1*0], m0
-    movh [r0+r1*1], m1
+    movh [dstq+dststrideq*0], m0
+    movh [dstq+dststrideq*1], m1
 %else
     packuswb  m0, m1
-    movh   [r0+r1*0], m0
-    movhps [r0+r1*1], m0
+    movh   [dstq+dststrideq*0], m0
+    movhps [dstq+dststrideq*1], m0
 %endif
 
-    lea       r0, [r0+r1*2]
-    lea       r2, [r2+r3*2]
-    sub      r4d, 2
+    lea     dstq, [dstq+dststrideq*2]
+    lea     srcq, [srcq+srcstrideq*2]
+    sub  heightd, 2
     jg .nextrow
     REP_RET
 %endmacro
 
-INIT_MMX
+INIT_MMX ssse3
 FILTER_BILINEAR_SSSE3 4
-INIT_XMM
+INIT_XMM ssse3
 FILTER_BILINEAR_SSSE3 8
 
-cglobal put_vp8_pixels8_mmx, 5,5
+INIT_MMX mmx
+cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
 .nextrow:
-    movq    mm0, [r2+r3*0]
-    movq    mm1, [r2+r3*1]
-    lea      r2, [r2+r3*2]
-    movq [r0+r1*0], mm0
-    movq [r0+r1*1], mm1
-    lea      r0, [r0+r1*2]
-    sub     r4d, 2
+    movq    mm0, [srcq+srcstrideq*0]
+    movq    mm1, [srcq+srcstrideq*1]
+    lea    srcq, [srcq+srcstrideq*2]
+    movq [dstq+dststrideq*0], mm0
+    movq [dstq+dststrideq*1], mm1
+    lea    dstq, [dstq+dststrideq*2]
+    sub heightd, 2
    jg .nextrow
    REP_RET
 
 %if ARCH_X86_32
-cglobal put_vp8_pixels16_mmx, 5,5
+INIT_MMX mmx
+cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
 .nextrow:
-    movq    mm0, [r2+r3*0+0]
-    movq    mm1, [r2+r3*0+8]
-    movq    mm2, [r2+r3*1+0]
-    movq    mm3, [r2+r3*1+8]
-    lea      r2, [r2+r3*2]
-    movq [r0+r1*0+0], mm0
-    movq [r0+r1*0+8], mm1
-    movq [r0+r1*1+0], mm2
-    movq [r0+r1*1+8], mm3
-    lea      r0, [r0+r1*2]
-    sub     r4d, 2
+    movq    mm0, [srcq+srcstrideq*0+0]
+    movq    mm1, [srcq+srcstrideq*0+8]
+    movq    mm2, [srcq+srcstrideq*1+0]
+    movq    mm3, [srcq+srcstrideq*1+8]
+    lea    srcq, [srcq+srcstrideq*2]
+    movq [dstq+dststrideq*0+0], mm0
+    movq [dstq+dststrideq*0+8], mm1
+    movq [dstq+dststrideq*1+0], mm2
+    movq [dstq+dststrideq*1+8], mm3
+    lea    dstq, [dstq+dststrideq*2]
+    sub heightd, 2
     jg .nextrow
     REP_RET
 %endif
 
-cglobal put_vp8_pixels16_sse, 5,5,2
+INIT_XMM sse
+cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
 .nextrow:
-    movups xmm0, [r2+r3*0]
-    movups xmm1, [r2+r3*1]
-    lea      r2, [r2+r3*2]
-    movaps [r0+r1*0], xmm0
-    movaps [r0+r1*1], xmm1
-    lea      r0, [r0+r1*2]
-    sub     r4d, 2
+    movups xmm0, [srcq+srcstrideq*0]
+    movups xmm1, [srcq+srcstrideq*1]
+    lea    srcq, [srcq+srcstrideq*2]
+    movaps [dstq+dststrideq*0], xmm0
+    movaps [dstq+dststrideq*1], xmm1
+    lea    dstq, [dstq+dststrideq*2]
+    sub heightd, 2
     jg .nextrow
     REP_RET
@@ -900,10 +906,10 @@ cglobal put_vp8_pixels16_sse, 5,5,2
 ;-----------------------------------------------------------------------------
 
 %macro ADD_DC 4
-    %4        m2, [r0+%3]
-    %4        m3, [r0+r2+%3]
-    %4        m4, [r1+%3]
-    %4        m5, [r1+r2+%3]
+    %4        m2, [dst1q+%3]
+    %4        m3, [dst1q+strideq+%3]
+    %4        m4, [dst2q+%3]
+    %4        m5, [dst2q+strideq+%3]
     paddusb   m2, %1
     paddusb   m3, %1
     paddusb   m4, %1
@@ -912,22 +918,22 @@ cglobal put_vp8_pixels16_sse, 5,5,2
     psubusb   m3, %2
     psubusb   m4, %2
     psubusb   m5, %2
-    %4   [r0+%3], m2
-    %4 [r0+r2+%3], m3
-    %4   [r1+%3], m4
-    %4 [r1+r2+%3], m5
+    %4 [dst1q+%3], m2
+    %4 [dst1q+strideq+%3], m3
+    %4 [dst2q+%3], m4
+    %4 [dst2q+strideq+%3], m5
 %endmacro
 
-INIT_MMX
-cglobal vp8_idct_dc_add_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
     ; load data
-    movd       m0, [r1]
+    movd       m0, [blockq]
 
     ; calculate DC
     paddw      m0, [pw_4]
     pxor       m1, m1
     psraw      m0, 3
-    movd     [r1], m1
+    movd [blockq], m1
     psubw      m1, m0
     packuswb   m0, m0
     packuswb   m1, m1
@@ -937,24 +943,26 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
     punpcklwd  m1, m1
 
     ; add DC
-    lea        r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]
     ADD_DC     m0, m1, 0, movh
     RET
 
-INIT_XMM
-cglobal vp8_idct_dc_add_sse4, 3, 3, 6
+INIT_XMM sse4
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
     ; load data
-    movd       m0, [r1]
+    movd       m0, [blockq]
     pxor       m1, m1
 
     ; calculate DC
     paddw      m0, [pw_4]
-    movd     [r1], m1
-    lea        r1, [r0+r2*2]
-    movd       m2, [r0]
-    movd       m3, [r0+r2]
-    movd       m4, [r1]
-    movd       m5, [r1+r2]
+    movd [blockq], m1
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]
+    movd       m2, [dst1q]
+    movd       m3, [dst1q+strideq]
+    movd       m4, [dst2q]
+    movd       m5, [dst2q+strideq]
     psraw      m0, 3
     pshuflw    m0, m0, 0
     punpcklqdq m0, m0
@@ -965,10 +973,10 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
     paddw      m2, m0
     paddw      m4, m0
     packuswb   m2, m4
-    movd      [r0], m2
-    pextrd [r0+r2], m2, 1
-    pextrd    [r1], m2, 2
-    pextrd [r1+r2], m2, 3
+    movd   [dst1q], m2
+    pextrd [dst1q+strideq], m2, 1
+    pextrd [dst2q], m2, 2
+    pextrd [dst2q+strideq], m2, 3
     RET
 
 ;-----------------------------------------------------------------------------
@@ -976,22 +984,22 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
 
 %if ARCH_X86_32
-INIT_MMX
-cglobal vp8_idct_dc_add4y_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
     ; load data
-    movd      m0, [r1+32*0] ; A
-    movd      m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd      m0, [blockq+32*0] ; A
+    movd      m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1        ; A B C D
     pxor      m6, m6
 
     ; calculate DC
     paddw     m0, [pw_4]
-    movd [r1+32*0], m6
-    movd [r1+32*1], m6
-    movd [r1+32*2], m6
-    movd [r1+32*3], m6
+    movd [blockq+32*0], m6
+    movd [blockq+32*1], m6
+    movd [blockq+32*2], m6
+    movd [blockq+32*3], m6
     psraw     m0, 3
     psubw     m6, m0
     packuswb  m0, m0
@@ -1006,28 +1014,29 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
     punpckhbw m7, m7        ; CCCCDDDD
 
     ; add DC
-    lea        r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]
     ADD_DC     m0, m6, 0, mova
     ADD_DC     m1, m7, 8, mova
     RET
 %endif
 
-INIT_XMM
-cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
+INIT_XMM sse2
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
     ; load data
-    movd      m0, [r1+32*0] ; A
-    movd      m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd      m0, [blockq+32*0] ; A
+    movd      m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1        ; A B C D
     pxor      m1, m1
 
     ; calculate DC
     paddw     m0, [pw_4]
-    movd [r1+32*0], m1
-    movd [r1+32*1], m1
-    movd [r1+32*2], m1
-    movd [r1+32*3], m1
+    movd [blockq+32*0], m1
+    movd [blockq+32*1], m1
+    movd [blockq+32*2], m1
+    movd [blockq+32*3], m1
     psraw     m0, 3
     psubw     m1, m0
     packuswb  m0, m0
@@ -1038,7 +1047,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
     punpcklbw m1, m1
 
     ; add DC
-    lea        r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]
     ADD_DC     m0, m1, 0, mova
     RET
 
@@ -1046,22 +1056,22 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
 ;-----------------------------------------------------------------------------
 
-INIT_MMX
-cglobal vp8_idct_dc_add4uv_mmx, 3, 3
+INIT_MMX mmx
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
     ; load data
-    movd      m0, [r1+32*0] ; A
-    movd      m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd      m0, [blockq+32*0] ; A
+    movd      m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1        ; A B C D
     pxor      m6, m6
 
     ; calculate DC
     paddw     m0, [pw_4]
-    movd [r1+32*0], m6
-    movd [r1+32*1], m6
-    movd [r1+32*2], m6
-    movd [r1+32*3], m6
+    movd [blockq+32*0], m6
+    movd [blockq+32*1], m6
+    movd [blockq+32*2], m6
+    movd [blockq+32*3], m6
     psraw     m0, 3
     psubw     m6, m0
     packuswb  m0, m0
@@ -1076,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
     punpckhbw m7, m7        ; CCCCDDDD
 
     ; add DC
-    lea        r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea     dst2q, [dst1q+strideq*2]
     ADD_DC     m0, m6, 0, mova
-    lea        r0, [r0+r2*4]
-    lea        r1, [r1+r2*4]
+    lea     dst1q, [dst1q+strideq*4]
+    lea     dst2q, [dst2q+strideq*4]
     ADD_DC     m1, m7, 0, mova
     RET
 
@@ -1118,26 +1129,25 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
     SWAP                 %4,  %3
 %endmacro
 
-INIT_MMX
-%macro VP8_IDCT_ADD 1
-cglobal vp8_idct_add_%1, 3, 3
+%macro VP8_IDCT_ADD 0
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
     ; load block data
-    movq         m0, [r1+ 0]
-    movq         m1, [r1+ 8]
-    movq         m2, [r1+16]
-    movq         m3, [r1+24]
+    movq         m0, [blockq+ 0]
+    movq         m1, [blockq+ 8]
+    movq         m2, [blockq+16]
+    movq         m3, [blockq+24]
     movq         m6, [pw_20091]
     movq         m7, [pw_17734]
-%ifidn %1, sse
+%if cpuflag(sse)
     xorps      xmm0, xmm0
-    movaps  [r1+ 0], xmm0
-    movaps  [r1+16], xmm0
+    movaps [blockq+ 0], xmm0
+    movaps [blockq+16], xmm0
 %else
     pxor         m4, m4
-    movq    [r1+ 0], m4
-    movq    [r1+ 8], m4
-    movq    [r1+16], m4
-    movq    [r1+24], m4
+    movq [blockq+ 0], m4
+    movq [blockq+ 8], m4
+    movq [blockq+16], m4
+    movq [blockq+24], m4
 %endif
 
     ; actual IDCT
@@ -1149,41 +1159,44 @@ cglobal vp8_idct_add_%1, 3, 3
 
     ; store
     pxor         m4, m4
-    lea          r1, [r0+2*r2]
-    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
-    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+    DEFINE_ARGS dst1, dst2, stride
+    lea       dst2q, [dst1q+2*strideq]
+    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
     RET
 %endmacro
 
 %if ARCH_X86_32
-VP8_IDCT_ADD mmx
+INIT_MMX mmx
+VP8_IDCT_ADD
 %endif
-VP8_IDCT_ADD sse
+INIT_MMX sse
+VP8_IDCT_ADD
 
 ;-----------------------------------------------------------------------------
 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
 ;-----------------------------------------------------------------------------
 
 %macro SCATTER_WHT 3
-    movd  r1d, m%1
-    movd  r2d, m%2
-    mov [r0+2*16*(0+%3)], r1w
-    mov [r0+2*16*(1+%3)], r2w
-    shr   r1d, 16
-    shr   r2d, 16
+    movd dc1d, m%1
+    movd dc2d, m%2
+    mov [blockq+2*16*(0+%3)], dc1w
+    mov [blockq+2*16*(1+%3)], dc2w
+    shr  dc1d, 16
+    shr  dc2d, 16
     psrlq m%1, 32
     psrlq m%2, 32
-    mov [r0+2*16*(4+%3)], r1w
-    mov [r0+2*16*(5+%3)], r2w
-    movd  r1d, m%1
-    movd  r2d, m%2
-    mov [r0+2*16*(8+%3)], r1w
-    mov [r0+2*16*(9+%3)], r2w
-    shr   r1d, 16
-    shr   r2d, 16
-    mov [r0+2*16*(12+%3)], r1w
-    mov [r0+2*16*(13+%3)], r2w
+    mov [blockq+2*16*(4+%3)], dc1w
+    mov [blockq+2*16*(5+%3)], dc2w
+    movd dc1d, m%1
+    movd dc2d, m%2
+    mov [blockq+2*16*(8+%3)], dc1w
+    mov [blockq+2*16*(9+%3)], dc2w
+    shr  dc1d, 16
+    shr  dc2d, 16
+    mov [blockq+2*16*(12+%3)], dc1w
+    mov [blockq+2*16*(13+%3)], dc2w
 %endmacro
 
 %macro HADAMARD4_1D 4
@@ -1192,22 +1205,22 @@ VP8_IDCT_ADD sse
     SWAP %1, %4, %3
 %endmacro
 
-%macro VP8_DC_WHT 1
-cglobal vp8_luma_dc_wht_%1, 2,3
-    movq          m0, [r1]
-    movq          m1, [r1+8]
-    movq          m2, [r1+16]
-    movq          m3, [r1+24]
-%ifidn %1, sse
+%macro VP8_DC_WHT 0
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+    movq          m0, [dc1q]
+    movq          m1, [dc1q+8]
+    movq          m2, [dc1q+16]
+    movq          m3, [dc1q+24]
+%if cpuflag(sse)
     xorps       xmm0, xmm0
-    movaps  [r1+ 0], xmm0
-    movaps  [r1+16], xmm0
+    movaps [dc1q+ 0], xmm0
+    movaps [dc1q+16], xmm0
 %else
     pxor          m4, m4
-    movq     [r1+ 0], m4
-    movq     [r1+ 8], m4
-    movq     [r1+16], m4
-    movq     [r1+24], m4
+    movq   [dc1q+ 0], m4
+    movq   [dc1q+ 8], m4
+    movq   [dc1q+16], m4
+    movq   [dc1q+24], m4
 %endif
     HADAMARD4_1D  0, 1, 2, 3
     TRANSPOSE4x4W 0, 1, 2, 3, 4
@@ -1222,11 +1235,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
     RET
 %endmacro
 
-INIT_MMX
 %if ARCH_X86_32
-VP8_DC_WHT mmx
+INIT_MMX mmx
+VP8_DC_WHT
 %endif
-VP8_DC_WHT sse
+INIT_MMX sse
+VP8_DC_WHT
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
 ;-----------------------------------------------------------------------------
@@ -1414,7 +1428,17 @@ VP8_DC_WHT sse
     add            %4, %5
 %endmacro
 
-%macro WRITE_8W_SSE2 5
+%macro WRITE_8W 5
+%if cpuflag(sse4)
+    pextrw [%3+%4*4], %1, 0
+    pextrw [%2+%4*4], %1, 1
+    pextrw [%3+%4*2], %1, 2
+    pextrw [%3+%4  ], %1, 3
+    pextrw [%3     ], %1, 4
+    pextrw [%2     ], %1, 5
+    pextrw [%2+%5  ], %1, 6
+    pextrw [%2+%5*2], %1, 7
+%else
     movd          %2d, %1
     psrldq         %1, 4
    mov      [%3+%4*4], %2w
@@ -1440,79 +1464,68 @@ VP8_DC_WHT sse
     mov      [%3+%5  ], %2w
     shr            %2, 16
     mov      [%3+%5*2], %2w
+%endif
 %endmacro
 
-%macro WRITE_8W_SSE4 5
-    pextrw [%3+%4*4], %1, 0
-    pextrw [%2+%4*4], %1, 1
-    pextrw [%3+%4*2], %1, 2
-    pextrw [%3+%4  ], %1, 3
-    pextrw [%3     ], %1, 4
-    pextrw [%2     ], %1, 5
-    pextrw [%2+%5  ], %1, 6
-    pextrw [%2+%5*2], %1, 7
-%endmacro
-
-%macro SPLATB_REG_MMX 2-3
+%macro SPLATB_REG 2-3
+%if cpuflag(ssse3)
+    movd           %1, %2d
+    pshufb         %1, %3
+%elif cpuflag(sse2)
     movd           %1, %2d
     punpcklbw      %1, %1
-    punpcklwd      %1, %1
-    punpckldq      %1, %1
-%endmacro
-
-%macro SPLATB_REG_MMXEXT 2-3
+    pshuflw        %1, %1, 0x0
+    punpcklqdq     %1, %1
+%elif cpuflag(mmx2)
     movd           %1, %2d
     punpcklbw      %1, %1
     pshufw         %1, %1, 0x0
-%endmacro
-
-%macro SPLATB_REG_SSE2 2-3
+%else
     movd           %1, %2d
     punpcklbw      %1, %1
-    pshuflw        %1, %1, 0x0
-    punpcklqdq     %1, %1
-%endmacro
-
-%macro SPLATB_REG_SSSE3 3
-    movd           %1, %2d
-    pshufb         %1, %3
+    punpcklwd      %1, %1
+    punpckldq      %1, %1
+%endif
 %endmacro
 
-%macro SIMPLE_LOOPFILTER 4
-cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
+%macro SIMPLE_LOOPFILTER 2
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 %if mmsize == 8 ; mmx/mmxext
-    mov            r3, 2
+    mov         cntrq, 2
 %endif
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
     pxor           m0, m0
 %endif
-%endif
-    SPLATB_REG     m7, r2, m0       ; splat "flim" into register
+    SPLATB_REG     m7, flim, m0     ; splat "flim" into register
 
     ; set up indexes to address 4 rows
-    mov            r2, r1
-    neg            r1
-%ifidn %2, h
-    lea            r0, [r0+4*r2-2]
+%if mmsize == 8
+    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
+%else
+    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
+%endif
+    mov       strideq, mstrideq
+    neg      mstrideq
+%ifidn %1, h
+    lea         dst1q, [dst1q+4*strideq-2]
 %endif
 
 %if mmsize == 8 ; mmx / mmxext
 .next8px
 %endif
-%ifidn %2, v
+%ifidn %1, v
     ; read 4 half/full rows of pixels
-    mova           m0, [r0+r1*2]    ; p1
-    mova           m1, [r0+r1]      ; p0
-    mova           m2, [r0]         ; q0
-    mova           m3, [r0+r2]      ; q1
+    mova           m0, [dst1q+mstrideq*2] ; p1
+    mova           m1, [dst1q+mstrideq]   ; p0
+    mova           m2, [dst1q]            ; q0
+    mova           m3, [dst1q+ strideq]   ; q1
 %else ; h
-    lea            r4, [r0+r2]
+    lea         dst2q, [dst1q+ strideq]
 
 %if mmsize == 8 ; mmx/mmxext
-    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
+    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
 %else ; sse2
-    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
+    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
 %endif
     TRANSPOSE4x4W         0, 1, 2, 3, 4
 %endif
@@ -1581,36 +1594,36 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
     psubusb        m6, m3           ; p0+f2
 
     ; store
-%ifidn %2, v
-    mova         [r0], m4
-    mova      [r0+r1], m6
+%ifidn %1, v
+    mova      [dst1q], m4
+    mova [dst1q+mstrideq], m6
 %else ; h
-    inc            r0
+    inc         dst1q
     SBUTTERFLY     bw, 6, 4, 0
 
 %if mmsize == 16 ; sse2
-%ifidn %1, sse4
-    inc            r4
+%if cpuflag(sse4)
+    inc         dst2q
 %endif
-    WRITE_8W       m6, r4, r0, r1, r2
-    lea            r4, [r3+r1+1]
-%ifidn %1, sse4
-    inc            r3
+    WRITE_8W       m6, dst2q, dst1q, mstrideq, strideq
+    lea         dst2q, [dst3q+mstrideq+1]
+%if cpuflag(sse4)
+    inc         dst3q
 %endif
-    WRITE_8W       m4, r3, r4, r1, r2
+    WRITE_8W       m4, dst3q, dst2q, mstrideq, strideq
 %else ; mmx/mmxext
-    WRITE_2x4W     m6, m4, r4, r0, r1, r2
+    WRITE_2x4W     m6, m4, dst2q, dst1q, mstrideq, strideq
 %endif
 %endif
 
 %if mmsize == 8 ; mmx/mmxext
     ; next 8 pixels
-%ifidn %2, v
-    add            r0, 8            ; advance 8 cols = pixels
+%ifidn %1, v
+    add         dst1q, 8            ; advance 8 cols = pixels
 %else ; h
-    lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
+    lea         dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
 %endif
-    dec            r3
+    dec         cntrq
     jg .next8px
     REP_RET
 %else ; sse2
@@ -1619,41 +1632,38 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
 %endmacro
 
 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-SIMPLE_LOOPFILTER mmx,    v, 4, 0
-SIMPLE_LOOPFILTER mmx,    h, 5, 0
-%define SPLATB_REG SPLATB_REG_MMXEXT
-SIMPLE_LOOPFILTER mmxext, v, 4, 0
-SIMPLE_LOOPFILTER mmxext, h, 5, 0
-%endif
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W   WRITE_8W_SSE2
-SIMPLE_LOOPFILTER sse2,  v, 3, 8
-SIMPLE_LOOPFILTER sse2,  h, 5, 8
-%define SPLATB_REG SPLATB_REG_SSSE3
-SIMPLE_LOOPFILTER ssse3, v, 3, 8
-SIMPLE_LOOPFILTER ssse3, h, 5, 8
-%define WRITE_8W   WRITE_8W_SSE4
-SIMPLE_LOOPFILTER sse4,  h, 5, 8
+INIT_MMX mmx
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+INIT_MMX mmx2
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+%endif
+
+INIT_XMM sse2
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM ssse3
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM sse4
+SIMPLE_LOOPFILTER h, 5
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------
 
-%macro INNER_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%macro INNER_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13
 %define dst8_reg    r1
 %define mstride_reg r2
 %define E_reg       r3
 %define I_reg       r4
 %define hev_thr_reg r5
 %else ; luma
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
@@ -1673,11 +1683,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %define stack_reg   hev_thr_reg
 %endif
 
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
     pxor             m7, m7
 %endif
-%endif
 
 %ifndef m8 ; mmx/mmxext or sse2 on x86-32
     ; splat function arguments
@@ -1688,7 +1696,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     ; align stack
     mov       stack_reg, rsp          ; backup stack pointer
     and             rsp, ~(mmsize-1)  ; align stack
-%ifidn %2, v
+%ifidn %1, v
     sub             rsp, mmsize * 4   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                       ;               [3]=hev() result
 %else ; h
@@ -1721,14 +1729,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
 %endif
 
-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
     mov         cnt_reg, 2
 %endif
     mov      stride_reg, mstride_reg
     neg     mstride_reg
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
     lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
 %endif
 %endif
@@ -1738,8 +1746,8 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endif
     ; read
     lea        dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
@@ -1750,7 +1758,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     movrow           m5, [dst2_reg]               ; q1
     movrow           m6, [dst2_reg+ stride_reg]   ; q2
     movrow           m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m0, [dst8_reg+mstride_reg*4]
     movhps           m2, [dst8_reg+mstride_reg*2]
     add        dst8_reg, stride_reg
@@ -1787,7 +1795,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
     lea        dst8_reg, [dst_reg + stride_reg*8]
 %endif
@@ -1874,7 +1882,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb          m6, m5          ; q2-q1
     por              m6, m4          ; abs(q2-q1)
 
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m4, flim_I
     pxor             m3, m3
     psubusb          m0, m4
@@ -1896,9 +1904,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
     movrow           m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m3, [dst8_reg+mstride_reg]
 %endif
 %elifdef m12
@@ -1914,7 +1922,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb          m1, m3          ; p1-p0
     psubusb          m6, m2          ; p0-p1
     por              m1, m6          ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m6, m1
     psubusb          m1, m4
     psubusb          m6, hev_thr
@@ -1928,9 +1936,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endif
     SWAP              6, 4           ; now m6 is I
 
-%ifidn %2, v
+%ifidn %1, v
     movrow           m4, [dst_reg] ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m4, [dst8_reg]
 %endif
 %elifdef m8
@@ -1945,7 +1953,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb          m1, m5          ; q0-q1
     psubusb          m7, m4          ; q1-q0
     por              m1, m7          ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m7, m1
     psubusb          m1, m6
     psubusb          m7, hev_thr
@@ -2053,14 +2061,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %else
     mova             m6, mask_res
 %endif
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m7, [pb_1]
 %else ; mmxext/sse2
     pxor             m7, m7
 %endif
     pand             m0, m6
     pand             m1, m6
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     paddusb          m0, m7
     pand             m1, [pb_FE]
     pandn            m7, m0
@@ -2078,12 +2086,12 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     paddusb          m2, m0          ; p1+a
 
     ; store
-%ifidn %2, v
+%ifidn %1, v
     movrow [dst_reg +mstride_reg*2], m2
     movrow [dst_reg +mstride_reg  ], m3
     movrow [dst_reg], m4
     movrow [dst_reg + stride_reg  ], m5
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps [dst8_reg+mstride_reg*2], m2
     movhps [dst8_reg+mstride_reg  ], m3
     movhps [dst8_reg], m4
@@ -2100,20 +2108,20 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
 %endif
 %endif
 
 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
     sub         dst_reg, 2
 %endif
     cmp         dst_reg, dst8_reg
     mov         dst_reg, dst8_reg
     jnz .next8px
 %else
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
     add         dst_reg, 8
@@ -2130,56 +2138,46 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endmacro
 
 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-INNER_LOOPFILTER mmx,    v, 6, 16, 0
-INNER_LOOPFILTER mmx,    h, 6, 16, 0
-INNER_LOOPFILTER mmx,    v, 6,  8, 0
-INNER_LOOPFILTER mmx,    h, 6,  8, 0
-
-%define SPLATB_REG SPLATB_REG_MMXEXT
-INNER_LOOPFILTER mmxext, v, 6, 16, 0
-INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmxext, v, 6,  8, 0
-INNER_LOOPFILTER mmxext, h, 6,  8, 0
-%endif
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-INNER_LOOPFILTER sse2, v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER sse2, h, 5, 16, 13
-%else
-INNER_LOOPFILTER sse2, h, 6, 16, 13
-%endif
-INNER_LOOPFILTER sse2, v, 6,  8, 13
-INNER_LOOPFILTER sse2, h, 6,  8, 13
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-INNER_LOOPFILTER ssse3, v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER ssse3, h, 5, 16, 13
-%else
-INNER_LOOPFILTER ssse3, h, 6, 16, 13
-%endif
-INNER_LOOPFILTER ssse3, v, 6,  8, 13
-INNER_LOOPFILTER ssse3, h, 6,  8, 13
+INIT_MMX mmx
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
+
+INIT_MMX mmx2
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
+%endif
+
+INIT_XMM sse2
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
+
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
 
 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------
 
-%macro MBEDGE_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
+%macro MBEDGE_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, %2, 15
 %define dst8_reg    r1
 %define mstride_reg r2
 %define E_reg       r3
 %define I_reg       r4
 %define hev_thr_reg r5
 %else ; luma
-cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
@@ -2199,14 +2197,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %define stack_reg   hev_thr_reg
 %endif
 
-%define ssse3_or_higher 0
-%ifnidn %1, sse2
-%if mmsize == 16
-%define ssse3_or_higher 1
-%endif
-%endif
-
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     pxor             m7, m7
 %endif
 
@@ -2267,14 +2258,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
 %endif
 
-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
     mov         cnt_reg, 2
 %endif
     mov      stride_reg, mstride_reg
     neg     mstride_reg
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
     lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
 %endif
 %endif
@@ -2284,8 +2275,8 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
     ; read
     lea        dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
@@ -2296,7 +2287,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     movrow           m5, [dst2_reg]               ; q1
     movrow           m6, [dst2_reg+ stride_reg]   ; q2
     movrow           m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m0, [dst8_reg+mstride_reg*4]
     movhps           m2, [dst8_reg+mstride_reg*2]
     add        dst8_reg, stride_reg
@@ -2333,7 +2324,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
     lea        dst8_reg, [dst_reg + stride_reg*8]
 %endif
@@ -2422,7 +2413,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m6, m5          ; q2-q1
     por              m6, m4          ; abs(q2-q1)
 
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m4, flim_I
     pxor             m3, m3
     psubusb          m0, m4
@@ -2444,9 +2435,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
     movrow           m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m3, [dst8_reg+mstride_reg]
 %endif
 %elifdef m12
@@ -2462,7 +2453,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m1, m3          ; p1-p0
     psubusb          m6, m2          ; p0-p1
     por              m1, m6          ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m6, m1
     psubusb          m1, m4
     psubusb          m6, hev_thr
@@ -2476,9 +2467,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
     SWAP              6, 4           ; now m6 is I
 
-%ifidn %2, v
+%ifidn %1, v
     movrow           m4, [dst_reg] ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps           m4, [dst8_reg]
 %endif
 %elifdef m8
@@ -2493,7 +2484,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m1, m5          ; q0-q1
     psubusb          m7, m4          ; q1-q0
     por              m1, m7          ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova             m7, m1
     psubusb          m1, m6
     psubusb          m7, hev_thr
@@ -2605,7 +2596,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb          m4, m1          ; q0-f1
 
     ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova             m7, [pb_1]
 %else
     mova             m7, [pw_63]
@@ -2618,7 +2609,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     pxor             m0, m0
     mova             m6, m1
     pcmpgtb          m0, m1         ; which are negative
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     punpcklbw        m6, m7         ; interleave with "1" for rounding
     punpckhbw        m1, m7
 %else
@@ -2626,7 +2617,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     punpckhbw        m1, m0
 %endif
     mova       lim_sign, m0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova             m7, [pb_27_63]
 %ifndef m8
     mova        lim_res, m1
@@ -2659,7 +2650,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb            m1, m6
     pand             m1, m0          ; -a0
     pandn            m0, m6          ; +a0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova             m6, [pb_18_63]  ; pipelining
 %endif
     psubusb          m3, m1
@@ -2667,7 +2658,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb          m3, m0          ; p0+a0
     psubusb          m4, m0          ; q0-a0
 
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     SWAP              6, 7
 %ifdef m10
     SWAP              1, 10
@@ -2699,7 +2690,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb            m1, m6
     pand             m1, m0          ; -a1
     pandn            m0, m6          ; +a1
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova             m6, [pb_9_63]
 %endif
     psubusb          m2, m1
@@ -2707,7 +2698,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb          m2, m0          ; p1+a1
     psubusb          m5, m0          ; q1-a1
 
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     SWAP              6, 7
 %ifdef m10
     SWAP              1, 10
@@ -2757,14 +2748,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb          m6, m7          ; q1-a1
 
     ; store
-%ifidn %2, v
+%ifidn %1, v
     movrow [dst2_reg+mstride_reg*4], m1
     movrow [dst_reg +mstride_reg*2], m2
     movrow [dst_reg +mstride_reg  ], m3
     movrow [dst_reg], m4
     movrow [dst2_reg], m5
     movrow [dst2_reg+ stride_reg  ], m6
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     add        dst8_reg, mstride_reg
     movhps [dst8_reg+mstride_reg*2], m1
     movhps [dst8_reg+mstride_reg  ], m2
@@ -2788,14 +2779,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     WRITE_2x4W       m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea        dst8_reg, [dst8_reg+mstride_reg+1]
-    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+    WRITE_4x4D        1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
     lea         dst_reg, [dst2_reg+mstride_reg+4]
     lea        dst8_reg, [dst8_reg+mstride_reg+4]
-%ifidn %1, sse4
+%if cpuflag(sse4)
    add        dst2_reg, 4
 %endif
     WRITE_8W         m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
-%ifidn %1, sse4
+%if cpuflag(sse4)
     lea        dst2_reg, [dst8_reg+ stride_reg]
 %endif
     WRITE_8W         m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
@@ -2803,15 +2794,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
 
 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
     sub         dst_reg, 5
 %endif
     cmp         dst_reg, dst8_reg
     mov         dst_reg, dst8_reg
     jnz .next8px
 %else
-%ifidn %2, h
+%ifidn %1, h
     lea         dst_reg, [dst_reg + stride_reg*8-5]
 %else ; v
     add         dst_reg, 8
@@ -2828,46 +2819,31 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endmacro
 
 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
-MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
-
-%define SPLATB_REG SPLATB_REG_MMXEXT
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
-MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0
-%endif
-
-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W   WRITE_8W_SSE2
-MBEDGE_LOOPFILTER sse2, v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER sse2, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse2, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse2, v, 6,  8, 15
-MBEDGE_LOOPFILTER sse2, h, 6,  8, 15
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER ssse3, v, 6,  8, 15
-MBEDGE_LOOPFILTER ssse3, h, 6,  8, 15
-
-%define WRITE_8W   WRITE_8W_SSE4
-%ifdef m8
-MBEDGE_LOOPFILTER sse4, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse4, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse4, h, 6,  8, 15
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
+
+INIT_MMX mmx2
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
+%endif
+
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
+
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8

+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER h, 6,  8