diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-05-22 23:47:06 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-05-24 15:15:56 +0200 |
commit | 81aa0f4604f98da692f2689c84968f90354a92ea (patch) | |
tree | 243197f645e40a7dd5219df7992e4f8452fc73a9 /libavcodec/x86/hpeldsp.asm | |
parent | 726316240bcc41cef6053dd6d1e46a3c57328498 (diff) | |
download | ffmpeg-81aa0f4604f98da692f2689c84968f90354a92ea.tar.gz |
x86: hpeldsp: implement SSSE3 version of _xy2
Loading pb_1 rather than pw_8192 was benchmarked to be more efficient.
Loading both constants yields no advantage, while loading only one of them saves ~11 cycles.
decicycles count:
put8: 3223(mmx) -> 2387
avg8: 2863(mmxext) -> 2125
put16: 4356(sse2) -> 3553
avg16: 4481(sse2) -> 3513
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86/hpeldsp.asm')
-rw-r--r-- | libavcodec/x86/hpeldsp.asm | 70 |
1 files changed, 70 insertions, 0 deletions
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 4af423aee5..76e4632cbc 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -30,6 +30,9 @@ SECTION_RODATA
cextern pb_1
cextern pw_2

; Rounding multiplier for pmulhrsw: (x*8192 + 0x4000) >> 15 == (x + 2) >> 2,
; i.e. the round-to-nearest divide-by-4 needed for the 4-tap _xy2 average.
pw_8192: times 8 dw (1<<13)
; pshufb masks that re-interleave the even (low half) and odd (high half)
; output pixels after the two packed word vectors are packuswb'd together.
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7

SECTION_TEXT
@@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

; void %1_pixels(8|16)_xy2(uint8_t *dst, const uint8_t *src,
;                          ptrdiff_t stride, int h)
; Half-pel interpolation in both directions:
;   dst[x] = (src[x] + src[x+1] + src[x+stride] + src[x+stride+1] + 2) >> 2
; %1 = put or avg; %2 (optional) = XMM register count — its presence selects
; the 16-pixel (XMM) variant, its absence the 8-pixel (MMX-reg) variant.
; Registers: r0 = dst, r1 = src (advanced one row below dst), r2 = stride,
;            r3d = remaining rows, r4 = running byte offset into both planes.
; Trick: pmaddubsw against pb_1 sums horizontally adjacent byte pairs into
; words, so m0 holds even-output pair sums ([src] version) and m1 odd-output
; pair sums ([src+1] version); adding the next row's pair sums gives the full
; 4-pixel sum.  The row sums are kept across iterations (m0/m1 <-> m2/m3) so
; each source row is loaded and pmaddubsw'd only once.
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; sse2
cglobal %1_pixels16_xy2, 4,5,%2
    mova      m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova      m4, [pb_interleave8]
%endif
    mova      m5, [pb_1]           ; all-ones bytes: pmaddubsw x, m5 = pair sums
    movu      m0, [r1]             ; prime the pipeline with row 0's pair sums
    movu      m1, [r1+1]
    pmaddubsw m0, m5               ; m0 = words s[2i]+s[2i+1]   (even outputs)
    pmaddubsw m1, m5               ; m1 = words s[2i+1]+s[2i+2] (odd outputs)
    xor       r4, r4
    add       r1, r2               ; src now points one row below dst
.loop:
    ; --- first output row: prev-row sums (m0/m1) + this-row sums (m2/m3) ---
    movu      m2, [r1+r4]
    movu      m3, [r1+r4+1]
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    paddusw   m0, m2               ; 4-pixel sums; max 1020 so never saturates
    paddusw   m1, m3
    pmulhrsw  m0, [pw_8192]        ; (sum + 2) >> 2 with rounding
    pmulhrsw  m1, [pw_8192]
%ifidn %1, avg
    mova      m6, [r0+r4]
    packuswb  m0, m1               ; bytes: [even pixels | odd pixels]
    pshufb    m0, m4               ; restore natural pixel order
    pavgb     m0, m6               ; average with existing dst
%else
    packuswb  m0, m1
    pshufb    m0, m4
%endif
    mova      [r0+r4], m0
    add       r4, r2

    ; --- second output row: this-row sums (m2/m3) + next-row sums (m0/m1) ---
    ; m0/m1 are also the carry-in for the next loop iteration.
    movu      m0, [r1+r4]
    movu      m1, [r1+r4+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    paddusw   m2, m0
    paddusw   m3, m1
    pmulhrsw  m2, [pw_8192]
    pmulhrsw  m3, [pw_8192]
%ifidn %1, avg
    mova      m6, [r0+r4]
    packuswb  m2, m3
    pshufb    m2, m4
    pavgb     m2, m6
%else
    packuswb  m2, m3
    pshufb    m2, m4
%endif
    mova      [r0+r4], m2
    add       r4, r2
    sub       r3d, 2               ; two rows handled per iteration
    jnz .loop
    REP_RET
%endmacro

; 8-pixel variants use SSSE3 instructions on MMX registers
INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7 |