diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2012-03-03 06:46:29 -0800 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2012-03-03 20:40:00 -0800 |
commit | 21ffc78fd75624626be297cee515b1f11a755ccf (patch) | |
tree | 2d184813835c585549472a0326540bbe3b4cd0b7 /libavcodec/x86 | |
parent | 28170f1a39236c5be91ab6df67e477a213c552b4 (diff) | |
download | ffmpeg-21ffc78fd75624626be297cee515b1f11a755ccf.tar.gz |
vp8: convert mc x86 assembly to use named arguments.
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/vp8dsp.asm | 544 |
1 files changed, 272 insertions, 272 deletions
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 4dba6db3b7..2f45f5a478 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -116,23 +116,25 @@ bilinear_filter_vb_m: times 8 db 7, 1 times 8 db 1, 7 %ifdef PIC -%define fourtap_filter_hw r11 -%define sixtap_filter_hw r11 -%define fourtap_filter_hb r11 -%define sixtap_filter_hb r11 -%define fourtap_filter_v r11 -%define sixtap_filter_v r11 -%define bilinear_filter_vw r11 -%define bilinear_filter_vb r11 +%define fourtap_filter_hw picregq +%define sixtap_filter_hw picregq +%define fourtap_filter_hb picregq +%define sixtap_filter_hb picregq +%define fourtap_filter_v picregq +%define sixtap_filter_v picregq +%define bilinear_filter_vw picregq +%define bilinear_filter_vb picregq +%define npicregs 1 %else -%define fourtap_filter_hw fourtap_filter_hw_m -%define sixtap_filter_hw sixtap_filter_hw_m -%define fourtap_filter_hb fourtap_filter_hb_m -%define sixtap_filter_hb sixtap_filter_hb_m -%define fourtap_filter_v fourtap_filter_v_m -%define sixtap_filter_v sixtap_filter_v_m +%define fourtap_filter_hw fourtap_filter_hw_m +%define sixtap_filter_hw sixtap_filter_hw_m +%define fourtap_filter_hb fourtap_filter_hb_m +%define sixtap_filter_hb sixtap_filter_hb_m +%define fourtap_filter_v fourtap_filter_v_m +%define sixtap_filter_v sixtap_filter_v_m %define bilinear_filter_vw bilinear_filter_vw_m %define bilinear_filter_vb bilinear_filter_vb_m +%define npicregs 0 %endif filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -174,25 +176,25 @@ SECTION .text ;----------------------------------------------------------------------------- %macro FILTER_SSSE3 1 -cglobal put_vp8_epel%1_h6, 6, 6, 8 - lea r5d, [r5*3] +cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] mova m3, [filter_h6_shuf2] mova m4, [filter_h6_shuf3] %ifdef PIC - lea r11, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_hb_m] %endif - mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes - mova m6, [sixtap_filter_hb+r5*8-32] - mova m7, [sixtap_filter_hb+r5*8-16] + mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes + mova m6, [sixtap_filter_hb+mxq*8-32] + mova m7, [sixtap_filter_hb+mxq*8-16] .nextrow - movu m0, [r2-2] + movu m0, [srcq-2] mova m1, m0 mova m2, m0 %if mmsize == 8 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the ; shuffle with a memory operand - punpcklbw m0, [r2+3] + punpcklbw m0, [srcq+3] %else pshufb m0, [filter_h6_shuf1] %endif @@ -206,28 +208,28 @@ cglobal put_vp8_epel%1_h6, 6, 6, 8 paddsw m0, [pw_64] psraw m0, 7 packuswb m0, m0 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_h4, 6, 6, 7 - shl r5d, 4 +cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 mova m2, [pw_64] mova m3, [filter_h2_shuf] mova m4, [filter_h4_shuf] %ifdef PIC - lea r11, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_hb_m] %endif - mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes - mova m6, [fourtap_filter_hb+r5] + mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes + mova m6, [fourtap_filter_hb+mxq] .nextrow - movu m0, [r2-1] + movu m0, [srcq-1] mova m1, m0 pshufb m0, m3 pshufb m1, m4 @@ -237,33 +239,33 @@ cglobal put_vp8_epel%1_h4, 6, 6, 7 paddsw m0, m1 psraw m0, 7 packuswb m0, m0 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_v4, 7, 7, 8 - shl r6d, 4 +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [fourtap_filter_hb_m] + lea picregq, [fourtap_filter_hb_m] %endif - mova m5, [fourtap_filter_hb+r6-16] - mova m6, [fourtap_filter_hb+r6] + mova m5, [fourtap_filter_hb+myq-16] + mova m6, [fourtap_filter_hb+myq] mova m7, [pw_64] ; read 3 lines - sub r2, r3 - movh m0, [r2] - movh m1, [r2+ r3] - movh m2, [r2+2*r3] - add r2, r3 + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq .nextrow - movh m3, [r2+2*r3] ; read new row + movh m3, [srcq+2*srcstrideq] ; read new row mova m4, m0 mova m0, m1 punpcklbw m4, m1 @@ -276,44 +278,44 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8 paddsw m4, m7 psraw m4, 7 packuswb m4, m4 - movh [r0], m4 + movh [dstq], m4 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET -cglobal put_vp8_epel%1_v6, 7, 7, 8 - lea r6d, [r6*3] +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + lea myd, [myq*3] %ifdef PIC - lea r11, [sixtap_filter_hb_m] + lea picregq, [sixtap_filter_hb_m] %endif - lea r6, [sixtap_filter_hb+r6*8] + lea myq, [sixtap_filter_hb+myq*8] ; read 5 lines - sub r2, r3 - sub r2, r3 - movh m0, [r2] - movh m1, [r2+r3] - movh m2, [r2+r3*2] - lea r2, [r2+r3*2] - add r2, r3 - movh m3, [r2] - movh m4, [r2+r3] + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] .nextrow - movh m5, [r2+2*r3] ; read new row + movh m5, [srcq+2*srcstrideq] ; read new row mova m6, m0 punpcklbw m6, m5 mova m0, m1 punpcklbw m1, m2 mova m7, m3 punpcklbw m7, m4 - pmaddubsw m6, [r6-48] - pmaddubsw m1, [r6-32] - pmaddubsw m7, [r6-16] + pmaddubsw m6, [myq-48] + pmaddubsw m1, [myq-32] + pmaddubsw m7, [myq-16] paddsw m6, m1 paddsw m6, m7 mova m1, m2 @@ -323,12 +325,12 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8 mova m3, m4 packuswb m6, m6 mova m4, m5 - movh [r0], m6 + movh [dstq], m6 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %endmacro @@ -340,18 +342,18 @@ FILTER_SSSE3 8 ; 4x4 block, H-only 4-tap filter INIT_MMX mmx2 -cglobal put_vp8_epel4_h4, 6, 6 - shl r5d, 4 +cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [fourtap_filter_hw_m] + lea picregq, [fourtap_filter_hw_m] %endif - movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words - movq mm5, [fourtap_filter_hw+r5] + movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words + movq mm5, [fourtap_filter_hw+mxq] movq mm7, [pw_64] pxor mm6, mm6 .nextrow - movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels + movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels ; first set of 2 pixels movq mm2, mm1 ; byte ABCD.. @@ -377,30 +379,30 @@ cglobal put_vp8_epel4_h4, 6, 6 paddsw mm3, mm7 ; rounding psraw mm3, 7 packuswb mm3, mm6 ; clip and word->bytes - movd [r0], mm3 ; store + movd [dstq], mm3 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET ; 4x4 block, H-only 6-tap filter INIT_MMX mmx2 -cglobal put_vp8_epel4_h6, 6, 6 - lea r5d, [r5*3] +cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] %ifdef PIC - lea r11, [sixtap_filter_hw_m] + lea picregq, [sixtap_filter_hw_m] %endif - movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 4tap filter in words - movq mm5, [sixtap_filter_hw+r5*8-32] - movq mm6, [sixtap_filter_hw+r5*8-16] + movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words + movq mm5, [sixtap_filter_hw+mxq*8-32] + movq mm6, [sixtap_filter_hw+mxq*8-16] movq mm7, [pw_64] pxor mm3, mm3 .nextrow - movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels + movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels ; first set of 2 pixels movq mm2, mm1 ; byte ABCD.. @@ -420,7 +422,7 @@ cglobal put_vp8_epel4_h6, 6, 6 paddd mm1, mm2 ; finish 1st 2px ; second set of 2 pixels, use backup of above - movd mm2, [r2+3] ; byte FGHI (prevent overreads) + movd mm2, [srcq+3] ; byte FGHI (prevent overreads) pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 paddd mm0, mm3 ; add to 2nd 2px cache @@ -435,35 +437,35 @@ cglobal put_vp8_epel4_h6, 6, 6 paddsw mm1, mm7 ; rounding psraw mm1, 7 packuswb mm1, mm3 ; clip and word->bytes - movd [r0], mm1 ; store + movd [dstq], mm1 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET INIT_XMM sse2 -cglobal put_vp8_epel8_h4, 6, 6, 10 - shl r5d, 5 +cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 5 %ifdef PIC - lea r11, [fourtap_filter_v_m] + lea picregq, [fourtap_filter_v_m] %endif - lea r5, [fourtap_filter_v+r5-32] + lea mxq, [fourtap_filter_v+mxq-32] pxor m7, m7 mova m4, [pw_64] - mova m5, [r5+ 0] - mova m6, [r5+16] + mova m5, [mxq+ 0] + mova m6, [mxq+16] %ifdef m8 - mova m8, [r5+32] - mova m9, [r5+48] + mova m8, [mxq+32] + mova m9, [mxq+48] %endif .nextrow - movq m0, [r2-1] - movq m1, [r2-0] - movq m2, [r2+1] - movq m3, [r2+2] + movq m0, [srcq-1] + movq m1, [srcq-0] + movq m2, [srcq+1] + movq m3, [srcq+2] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -474,8 +476,8 @@ cglobal put_vp8_epel8_h4, 6, 6, 10 pmullw m2, m8 pmullw m3, m9 %else - pmullw m2, [r5+32] - pmullw m3, [r5+48] + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] %endif paddsw m0, m1 paddsw m2, m3 @@ -483,40 +485,40 @@ cglobal put_vp8_epel8_h4, 6, 6, 10 paddsw m0, m4 psraw m0, 7 packuswb m0, m7 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET INIT_XMM sse2 -cglobal put_vp8_epel8_h6, 6, 6, 14 - lea r5d, [r5*3] - shl r5d, 4 +cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg + lea mxd, [mxq*3] + shl mxd, 4 %ifdef PIC - lea r11, [sixtap_filter_v_m] + lea picregq, [sixtap_filter_v_m] %endif - lea r5, [sixtap_filter_v+r5-96] + lea mxq, [sixtap_filter_v+mxq-96] pxor m7, m7 mova m6, [pw_64] %ifdef m8 - mova m8, [r5+ 0] - mova m9, [r5+16] - mova m10, [r5+32] - mova m11, [r5+48] - mova m12, [r5+64] - mova m13, [r5+80] + mova m8, [mxq+ 0] + mova m9, [mxq+16] + mova m10, [mxq+32] + mova m11, [mxq+48] + mova m12, [mxq+64] + mova m13, [mxq+80] %endif .nextrow - movq m0, [r2-2] - movq m1, [r2-1] - movq m2, [r2-0] - movq m3, [r2+1] - movq m4, [r2+2] - movq m5, [r2+3] + movq m0, [srcq-2] + movq m1, [srcq-1] + movq m2, [srcq-0] + movq m3, [srcq+1] + movq m4, [srcq+2] + movq m5, [srcq+3] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -531,12 +533,12 @@ cglobal put_vp8_epel8_h6, 6, 6, 14 pmullw m4, m12 pmullw m5, m13 %else - pmullw m0, [r5+ 0] - pmullw m1, [r5+16] - pmullw m2, [r5+32] - pmullw m3, [r5+48] - pmullw m4, [r5+64] - pmullw m5, [r5+80] + pmullw m0, [mxq+ 0] + pmullw m1, [mxq+16] + pmullw m2, [mxq+32] + pmullw m3, [mxq+48] + pmullw m4, [mxq+64] + pmullw m5, [mxq+80] %endif paddsw m1, m4 paddsw m0, m5 @@ -546,52 +548,52 @@ cglobal put_vp8_epel8_h6, 6, 6, 14 paddsw m0, m6 psraw m0, 7 packuswb m0, m7 - movh [r0], m0 ; store + movh [dstq], m0 ; store ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %macro FILTER_V 1 ; 4x4 block, V-only 4-tap filter -cglobal put_vp8_epel%1_v4, 7, 7, 8 - shl r6d, 5 +cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 5 %ifdef PIC - lea r11, [fourtap_filter_v_m] + lea picregq, [fourtap_filter_v_m] %endif - lea r6, [fourtap_filter_v+r6-32] + lea myq, [fourtap_filter_v+myq-32] mova m6, [pw_64] pxor m7, m7 - mova m5, [r6+48] + mova m5, [myq+48] ; read 3 lines - sub r2, r3 - movh m0, [r2] - movh m1, [r2+ r3] - movh m2, [r2+2*r3] - add r2, r3 + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+ srcstrideq] + movh m2, [srcq+2*srcstrideq] + add srcq, srcstrideq punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 .nextrow ; first calculate negative taps (to prevent losing positive overflows) - movh m4, [r2+2*r3] ; read new row + movh m4, [srcq+2*srcstrideq] ; read new row punpcklbw m4, m7 mova m3, m4 - pmullw m0, [r6+0] + pmullw m0, [myq+0] pmullw m4, m5 paddsw m4, m0 ; then calculate positive taps mova m0, m1 - pmullw m1, [r6+16] + pmullw m1, [myq+16] paddsw m4, m1 mova m1, m2 - pmullw m2, [r6+32] + pmullw m2, [myq+32] paddsw m4, m2 mova m2, m3 @@ -599,36 +601,36 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8 paddsw m4, m6 psraw m4, 7 packuswb m4, m7 - movh [r0], m4 + movh [dstq], m4 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET ; 4x4 block, V-only 6-tap filter -cglobal put_vp8_epel%1_v6, 7, 7, 8 - shl r6d, 4 - lea r6, [r6*3] +cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 + lea myq, [myq*3] %ifdef PIC - lea r11, [sixtap_filter_v_m] + lea picregq, [sixtap_filter_v_m] %endif - lea r6, [sixtap_filter_v+r6-96] + lea myq, [sixtap_filter_v+myq-96] pxor m7, m7 ; read 5 lines - sub r2, r3 - sub r2, r3 - movh m0, [r2] - movh m1, [r2+r3] - movh m2, [r2+r3*2] - lea r2, [r2+r3*2] - add r2, r3 - movh m3, [r2] - movh m4, [r2+r3] + sub srcq, srcstrideq + sub srcq, srcstrideq + movh m0, [srcq] + movh m1, [srcq+srcstrideq] + movh m2, [srcq+srcstrideq*2] + lea srcq, [srcq+srcstrideq*2] + add srcq, srcstrideq + movh m3, [srcq] + movh m4, [srcq+srcstrideq] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 @@ -638,38 +640,38 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8 .nextrow ; first calculate negative taps (to prevent losing positive overflows) mova m5, m1 - pmullw m5, [r6+16] + pmullw m5, [myq+16] mova m6, m4 - pmullw m6, [r6+64] + pmullw m6, [myq+64] paddsw m6, m5 ; then calculate positive taps - movh m5, [r2+2*r3] ; read new row + movh m5, [srcq+2*srcstrideq] ; read new row punpcklbw m5, m7 - pmullw m0, [r6+0] + pmullw m0, [myq+0] paddsw m6, m0 mova m0, m1 mova m1, m2 - pmullw m2, [r6+32] + pmullw m2, [myq+32] paddsw m6, m2 mova m2, m3 - pmullw m3, [r6+48] + pmullw m3, [myq+48] paddsw m6, m3 mova m3, m4 mova m4, m5 - pmullw m5, [r6+80] + pmullw m5, [myq+80] paddsw m6, m5 ; round/clip/store paddsw m6, [pw_64] psraw m6, 7 packuswb m6, m7 - movh [r0], m6 + movh [dstq], m6 ; go to next line - add r0, r1 - add r2, r3 - dec r4d ; next row + add dstq, dststrideq + add srcq, srcstrideq + dec heightd ; next row jg .nextrow REP_RET %endmacro @@ -680,20 +682,19 @@ INIT_XMM sse2 FILTER_V 8 %macro FILTER_BILINEAR 1 -cglobal put_vp8_bilinear%1_v, 7, 7, 7 - mov r5d, 8*16 - shl r6d, 4 - sub r5d, r6d +cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [bilinear_filter_vw_m] + lea picregq, [bilinear_filter_vw_m] %endif pxor m6, m6 - mova m4, [bilinear_filter_vw+r5-16] - mova m5, [bilinear_filter_vw+r6-16] + mova m5, [bilinear_filter_vw+myq-1*16] + neg myq + mova m4, [bilinear_filter_vw+myq+7*16] .nextrow - movh m0, [r2+r3*0] - movh m1, [r2+r3*1] - movh m3, [r2+r3*2] + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m3, [srcq+srcstrideq*2] punpcklbw m0, m6 punpcklbw m1, m6 punpcklbw m3, m6 @@ -711,35 +712,34 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7 %if mmsize == 8 packuswb m0, m0 packuswb m2, m2 - movh [r0+r1*0], m0 - movh [r0+r1*1], m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 %else packuswb m0, m2 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_bilinear%1_h, 7, 7, 7 - mov r6d, 8*16 - shl r5d, 4 - sub r6d, r5d +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [bilinear_filter_vw_m] + lea picregq, [bilinear_filter_vw_m] %endif pxor m6, m6 - mova m4, [bilinear_filter_vw+r6-16] - mova m5, [bilinear_filter_vw+r5-16] + mova m5, [bilinear_filter_vw+mxq-1*16] + neg mxq + mova m4, [bilinear_filter_vw+mxq+7*16] .nextrow - movh m0, [r2+r3*0+0] - movh m1, [r2+r3*0+1] - movh m2, [r2+r3*1+0] - movh m3, [r2+r3*1+1] + movh m0, [srcq+srcstrideq*0+0] + movh m1, [srcq+srcstrideq*0+1] + movh m2, [srcq+srcstrideq*1+0] + movh m3, [srcq+srcstrideq*1+1] punpcklbw m0, m6 punpcklbw m1, m6 punpcklbw m2, m6 @@ -757,17 +757,17 @@ cglobal put_vp8_bilinear%1_h, 7, 7, 7 %if mmsize == 8 packuswb m0, m0 packuswb m2, m2 - movh [r0+r1*0], m0 - movh [r0+r1*1], m2 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m2 %else packuswb m0, m2 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endmacro @@ -778,17 +778,17 @@ INIT_XMM sse2 FILTER_BILINEAR 8 %macro FILTER_BILINEAR_SSSE3 1 -cglobal put_vp8_bilinear%1_v, 7, 7, 5 - shl r6d, 4 +cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my + shl myd, 4 %ifdef PIC - lea r11, [bilinear_filter_vb_m] + lea picregq, [bilinear_filter_vb_m] %endif pxor m4, m4 - mova m3, [bilinear_filter_vb+r6-16] + mova m3, [bilinear_filter_vb+myq-16] .nextrow - movh m0, [r2+r3*0] - movh m1, [r2+r3*1] - movh m2, [r2+r3*2] + movh m0, [srcq+srcstrideq*0] + movh m1, [srcq+srcstrideq*1] + movh m2, [srcq+srcstrideq*2] punpcklbw m0, m1 punpcklbw m1, m2 pmaddubsw m0, m3 @@ -800,31 +800,31 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5 %if mmsize==8 packuswb m0, m0 packuswb m1, m1 - movh [r0+r1*0], m0 - movh [r0+r1*1], m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 %else packuswb m0, m1 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET -cglobal put_vp8_bilinear%1_h, 7, 7, 5 - shl r5d, 4 +cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg + shl mxd, 4 %ifdef PIC - lea r11, [bilinear_filter_vb_m] + lea picregq, [bilinear_filter_vb_m] %endif pxor m4, m4 mova m2, [filter_h2_shuf] - mova m3, [bilinear_filter_vb+r5-16] + mova m3, [bilinear_filter_vb+mxq-16] .nextrow - movu m0, [r2+r3*0] - movu m1, [r2+r3*1] + movu m0, [srcq+srcstrideq*0] + movu m1, [srcq+srcstrideq*1] pshufb m0, m2 pshufb m1, m2 pmaddubsw m0, m3 @@ -836,17 +836,17 @@ cglobal put_vp8_bilinear%1_h, 7, 7, 5 %if mmsize==8 packuswb m0, m0 packuswb m1, m1 - movh [r0+r1*0], m0 - movh [r0+r1*1], m1 + movh [dstq+dststrideq*0], m0 + movh [dstq+dststrideq*1], m1 %else packuswb m0, m1 - movh [r0+r1*0], m0 - movhps [r0+r1*1], m0 + movh [dstq+dststrideq*0], m0 + movhps [dstq+dststrideq*1], m0 %endif - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - sub r4d, 2 + lea dstq, [dstq+dststrideq*2] + lea srcq, [srcq+srcstrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endmacro @@ -857,47 +857,47 @@ INIT_XMM ssse3 FILTER_BILINEAR_SSSE3 8 INIT_MMX mmx -cglobal put_vp8_pixels8, 5,5 +cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [r2+r3*0] - movq mm1, [r2+r3*1] - lea r2, [r2+r3*2] - movq [r0+r1*0], mm0 - movq [r0+r1*1], mm1 - lea r0, [r0+r1*2] - sub r4d, 2 + movq mm0, [srcq+srcstrideq*0] + movq mm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0], mm0 + movq [dstq+dststrideq*1], mm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET %if ARCH_X86_32 INIT_MMX mmx -cglobal put_vp8_pixels16, 5,5 +cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height .nextrow: - movq mm0, [r2+r3*0+0] - movq mm1, [r2+r3*0+8] - movq mm2, [r2+r3*1+0] - movq mm3, [r2+r3*1+8] - lea r2, [r2+r3*2] - movq [r0+r1*0+0], mm0 - movq [r0+r1*0+8], mm1 - movq [r0+r1*1+0], mm2 - movq [r0+r1*1+8], mm3 - lea r0, [r0+r1*2] - sub r4d, 2 + movq mm0, [srcq+srcstrideq*0+0] + movq mm1, [srcq+srcstrideq*0+8] + movq mm2, [srcq+srcstrideq*1+0] + movq mm3, [srcq+srcstrideq*1+8] + lea srcq, [srcq+srcstrideq*2] + movq [dstq+dststrideq*0+0], mm0 + movq [dstq+dststrideq*0+8], mm1 + movq [dstq+dststrideq*1+0], mm2 + movq [dstq+dststrideq*1+8], mm3 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET %endif INIT_XMM sse -cglobal put_vp8_pixels16, 5,5,2 +cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height .nextrow: - movups xmm0, [r2+r3*0] - movups xmm1, [r2+r3*1] - lea r2, [r2+r3*2] - movaps [r0+r1*0], xmm0 - movaps [r0+r1*1], xmm1 - lea r0, [r0+r1*2] - sub r4d, 2 + movups xmm0, [srcq+srcstrideq*0] + movups xmm1, [srcq+srcstrideq*1] + lea srcq, [srcq+srcstrideq*2] + movaps [dstq+dststrideq*0], xmm0 + movaps [dstq+dststrideq*1], xmm1 + lea dstq, [dstq+dststrideq*2] + sub heightd, 2 jg .nextrow REP_RET |