aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/vp8dsp.asm
diff options
context:
space:
mode:
authorJason Garrett-Glaser <darkshikari@gmail.com>2010-07-02 05:27:41 +0000
committerJason Garrett-Glaser <darkshikari@gmail.com>2010-07-02 05:27:41 +0000
commitdcc602d8022aac4d419038d6020e07584011aa8a (patch)
treebe5dc8007fbe6f763317f5b7fa6014e63ae82a7a /libavcodec/x86/vp8dsp.asm
parent301ab19dd97deb30a073e93d3c8a969bb7f6966f (diff)
downloadffmpeg-dcc602d8022aac4d419038d6020e07584011aa8a.tar.gz
SSSE3 versions of width4 VP8 6-tap MC functions
Also make some small changes to saturation order of 4-tap SSSE3 MC to fix a non-bitexactness bug. Patch mostly by Eli Friedman <eli.friedman AT gmail DOT com>. Originally committed as revision 23965 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/vp8dsp.asm')
-rw-r--r--libavcodec/x86/vp8dsp.asm335
1 files changed, 174 insertions, 161 deletions
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 37b127bb4c..3ac9ca9505 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -44,14 +44,14 @@ sixtap_filter_hw_m: times 4 dw 2, -11
times 4 dw 36, 108
times 4 dw -11, 2
-fourtap_filter_hb_m: times 8 db -6, -1
- times 8 db 123, 12
- times 8 db -9, -6
- times 8 db 93, 50
- times 8 db -6, -9
- times 8 db 50, 93
- times 8 db -1, -6
- times 8 db 12, 123
+fourtap_filter_hb_m: times 8 db -6, 123
+ times 8 db 12, -1
+ times 8 db -9, 93
+ times 8 db 50, -6
+ times 8 db -6, 50
+ times 8 db 93, -9
+ times 8 db -1, 12
+ times 8 db 123, -6
sixtap_filter_hb_m: times 8 db 2, 1
times 8 db -11, 108
@@ -136,7 +136,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
%endif
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-filter_h4_shuf: db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
+filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
@@ -159,6 +159,171 @@ SECTION .text
; int height, int mx, int my);
;-----------------------------------------------------------------------------
+%macro FILTER_SSSE3 3
+cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
+ lea r5d, [r5*3]
+ mova m3, [filter_h6_shuf2]
+ mova m4, [filter_h6_shuf3]
+%ifdef PIC
+ lea r11, [sixtap_filter_hb_m]
+%endif
+ mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
+ mova m6, [sixtap_filter_hb+r5*8-32]
+ mova m7, [sixtap_filter_hb+r5*8-16]
+
+.nextrow
+ movu m0, [r2-2]
+ mova m1, m0
+ mova m2, m0
+%ifidn %1, 4
+; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
+; shuffle with a memory operand
+ punpcklbw m0, [r2+3]
+%else
+ pshufb m0, [filter_h6_shuf1]
+%endif
+ pshufb m1, m3
+ pshufb m2, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ paddsw m0, m1
+ paddsw m0, m2
+ paddsw m0, [pw_64]
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0 ; store
+
+ ; go to next line
+ add r0, r1
+ add r2, r3
+ dec r4 ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
+ shl r5d, 4
+ mova m2, [pw_64]
+ mova m3, [filter_h2_shuf]
+ mova m4, [filter_h4_shuf]
+%ifdef PIC
+ lea r11, [fourtap_filter_hb_m]
+%endif
+ mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
+ mova m6, [fourtap_filter_hb+r5]
+
+.nextrow
+ movu m0, [r2-1]
+ mova m1, m0
+ pshufb m0, m3
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m6
+ paddsw m0, m2
+ paddsw m0, m1
+ psraw m0, 7
+ packuswb m0, m0
+ movh [r0], m0 ; store
+
+ ; go to next line
+ add r0, r1
+ add r2, r3
+ dec r4 ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
+ shl r6d, 4
+%ifdef PIC
+ lea r11, [fourtap_filter_hb_m]
+%endif
+ mova m5, [fourtap_filter_hb+r6-16]
+ mova m6, [fourtap_filter_hb+r6]
+ mova m7, [pw_64]
+
+ ; read 3 lines
+ sub r2, r3
+ movh m0, [r2]
+ movh m1, [r2+ r3]
+ movh m2, [r2+2*r3]
+ add r2, r3
+
+.nextrow
+ movh m3, [r2+2*r3] ; read new row
+ mova m4, m0
+ mova m0, m1
+ punpcklbw m4, m1
+ mova m1, m2
+ punpcklbw m2, m3
+ pmaddubsw m4, m5
+ pmaddubsw m2, m6
+ paddsw m4, m2
+ mova m2, m3
+ paddsw m4, m7
+ psraw m4, 7
+ packuswb m4, m4
+ movh [r0], m4
+
+ ; go to next line
+ add r0, r1
+ add r2, r3
+ dec r4 ; next row
+ jg .nextrow
+ REP_RET
+
+cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
+ lea r6d, [r6*3]
+%ifdef PIC
+ lea r11, [sixtap_filter_hb_m]
+%endif
+ lea r6, [sixtap_filter_hb+r6*8]
+
+ ; read 5 lines
+ sub r2, r3
+ sub r2, r3
+ movh m0, [r2]
+ movh m1, [r2+r3]
+ movh m2, [r2+r3*2]
+ lea r2, [r2+r3*2]
+ add r2, r3
+ movh m3, [r2]
+ movh m4, [r2+r3]
+
+.nextrow
+ movh m5, [r2+2*r3] ; read new row
+ mova m6, m0
+ punpcklbw m6, m5
+ mova m0, m1
+ punpcklbw m1, m2
+ mova m7, m3
+ punpcklbw m7, m4
+ pmaddubsw m6, [r6-48]
+ pmaddubsw m1, [r6-32]
+ pmaddubsw m7, [r6-16]
+ paddsw m6, m1
+ paddsw m6, m7
+ mova m1, m2
+ paddsw m6, [pw_64]
+ mova m2, m3
+ psraw m6, 7
+ mova m3, m4
+ packuswb m6, m6
+ mova m4, m5
+ movh [r0], m6
+
+ ; go to next line
+ add r0, r1
+ add r2, r3
+ dec r4 ; next row
+ jg .nextrow
+ REP_RET
+%endmacro
+
+INIT_MMX
+FILTER_SSSE3 4, 0, 0
+INIT_XMM
+FILTER_SSSE3 8, 8, 7
+
; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6
shl r5d, 4
@@ -383,72 +548,6 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
jg .nextrow
REP_RET
-cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
- shl r5d, 4
- mova m2, [pw_64]
- mova m3, [filter_h4_shuf]
- mova m4, [filter_h6_shuf2]
-%ifdef PIC
- lea r11, [fourtap_filter_hb_m]
-%endif
- mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
- mova m6, [fourtap_filter_hb+r5]
-
-.nextrow
- movu m0, [r2-1]
- mova m1, m0
- pshufb m0, m3
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m6
- paddsw m0, m2
- paddsw m0, m1
- psraw m0, 7
- packuswb m0, m0
- movh [r0], m0 ; store
-
- ; go to next line
- add r0, r1
- add r2, r3
- dec r4 ; next row
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
- lea r5d, [r5*3]
- mova m3, [filter_h6_shuf1]
- mova m4, [filter_h6_shuf2]
-%ifdef PIC
- lea r11, [sixtap_filter_hb_m]
-%endif
- mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
- mova m6, [sixtap_filter_hb+r5*8-32]
- mova m7, [sixtap_filter_hb+r5*8-16]
-
-.nextrow
- movu m0, [r2-2]
- mova m1, m0
- mova m2, m0
- pshufb m0, m3
- pshufb m1, m4
- pshufb m2, [filter_h6_shuf3]
- pmaddubsw m0, m5
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- paddsw m0, m1
- paddsw m0, m2
- paddsw m0, [pw_64]
- psraw m0, 7
- packuswb m0, m0
- movh [r0], m0 ; store
-
- ; go to next line
- add r0, r1
- add r2, r3
- dec r4 ; next row
- jg .nextrow
- REP_RET
-
%macro FILTER_V 3
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
@@ -573,92 +672,6 @@ FILTER_V mmxext, 4, 0
INIT_XMM
FILTER_V sse2, 8, 8
-cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
- shl r6d, 4
-%ifdef PIC
- lea r11, [fourtap_filter_hb_m]
-%endif
- mova m5, [fourtap_filter_hb+r6-16]
- mova m6, [fourtap_filter_hb+r6]
- mova m7, [pw_64]
-
- ; read 3 lines
- sub r2, r3
- movh m0, [r2]
- movh m1, [r2+ r3]
- movh m2, [r2+2*r3]
- add r2, r3
-
-.nextrow
- movh m3, [r2+2*r3] ; read new row
- mova m4, m0
- mova m0, m1
- punpcklbw m4, m3
- punpcklbw m1, m2
- pmaddubsw m4, m5
- pmaddubsw m1, m6
- paddsw m4, m1
- mova m1, m2
- paddsw m4, m7
- mova m2, m3
- psraw m4, 7
- packuswb m4, m4
- movh [r0], m4
-
- ; go to next line
- add r0, r1
- add r2, r3
- dec r4 ; next row
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
- lea r6d, [r6*3]
-%ifdef PIC
- lea r11, [sixtap_filter_hb_m]
-%endif
- lea r6, [sixtap_filter_hb+r6*8]
-
- ; read 5 lines
- sub r2, r3
- sub r2, r3
- movh m0, [r2]
- movh m1, [r2+r3]
- movh m2, [r2+r3*2]
- lea r2, [r2+r3*2]
- add r2, r3
- movh m3, [r2]
- movh m4, [r2+r3]
-
-.nextrow
- movh m5, [r2+2*r3] ; read new row
- mova m6, m0
- punpcklbw m6, m5
- mova m0, m1
- punpcklbw m1, m2
- mova m7, m3
- punpcklbw m7, m4
- pmaddubsw m6, [r6-48]
- pmaddubsw m1, [r6-32]
- pmaddubsw m7, [r6-16]
- paddsw m6, m1
- paddsw m6, m7
- mova m1, m2
- paddsw m6, [pw_64]
- mova m2, m3
- psraw m6, 7
- mova m3, m4
- packuswb m6, m6
- mova m4, m5
- movh [r0], m6
-
- ; go to next line
- add r0, r1
- add r2, r3
- dec r4 ; next row
- jg .nextrow
- REP_RET
-
%macro FILTER_BILINEAR 3
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
mov r5d, 8*16