diff options
author | Christophe Gisquet <christophe.gisquet@gmail.com> | 2014-05-22 23:47:06 +0200 |
---|---|---|
committer | Michael Niedermayer <michaelni@gmx.at> | 2014-05-24 15:15:56 +0200 |
commit | 81aa0f4604f98da692f2689c84968f90354a92ea (patch) | |
tree | 243197f645e40a7dd5219df7992e4f8452fc73a9 /libavcodec/x86 | |
parent | 726316240bcc41cef6053dd6d1e46a3c57328498 (diff) | |
download | ffmpeg-81aa0f4604f98da692f2689c84968f90354a92ea.tar.gz |
x86: hpeldsp: implement SSSE3 version of _xy2
Preloading pb_1 into a register, rather than pw_8192, was benchmarked to be more efficient.
Preloading both constants yields no advantage, whereas preloading only one saves ~11 cycles.
decicycles count:
put8: 3223(mmx) -> 2387
avg8: 2863(mmxext) -> 2125
put16: 4356(sse2) -> 3553
avg16: 4481(sse2) -> 3513
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/hpeldsp.asm | 70 | ||||
-rw-r--r-- | libavcodec/x86/hpeldsp_init.c | 22 |
2 files changed, 92 insertions, 0 deletions
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm index 4af423aee5..76e4632cbc 100644 --- a/libavcodec/x86/hpeldsp.asm +++ b/libavcodec/x86/hpeldsp.asm @@ -30,6 +30,9 @@ SECTION_RODATA cextern pb_1 cextern pw_2 +pw_8192: times 8 dw (1<<13) +pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 SECTION_TEXT @@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg INIT_XMM sse2 SET_PIXELS_XY2 put SET_PIXELS_XY2 avg + +%macro SSSE3_PIXELS_XY2 1-2 +%if %0 == 2 ; sse2 +cglobal %1_pixels16_xy2, 4,5,%2 + mova m4, [pb_interleave16] +%else +cglobal %1_pixels8_xy2, 4,5 + mova m4, [pb_interleave8] +%endif + mova m5, [pb_1] + movu m0, [r1] + movu m1, [r1+1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + xor r4, r4 + add r1, r2 +.loop: + movu m2, [r1+r4] + movu m3, [r1+r4+1] + pmaddubsw m2, m5 + pmaddubsw m3, m5 + paddusw m0, m2 + paddusw m1, m3 + pmulhrsw m0, [pw_8192] + pmulhrsw m1, [pw_8192] +%ifidn %1, avg + mova m6, [r0+r4] + packuswb m0, m1 + pshufb m0, m4 + pavgb m0, m6 +%else + packuswb m0, m1 + pshufb m0, m4 +%endif + mova [r0+r4], m0 + add r4, r2 + + movu m0, [r1+r4] + movu m1, [r1+r4+1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + paddusw m2, m0 + paddusw m3, m1 + pmulhrsw m2, [pw_8192] + pmulhrsw m3, [pw_8192] +%ifidn %1, avg + mova m6, [r0+r4] + packuswb m2, m3 + pshufb m2, m4 + pavgb m2, m6 +%else + packuswb m2, m3 + pshufb m2, m4 +%endif + mova [r0+r4], m2 + add r4, r2 + sub r3d, 2 + jnz .loop + REP_RET +%endmacro + +INIT_MMX ssse3 +SSSE3_PIXELS_XY2 put +SSSE3_PIXELS_XY2 avg +INIT_XMM ssse3 +SSSE3_PIXELS_XY2 put, 6 +SSSE3_PIXELS_XY2 avg, 7 diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index cda16dc722..42e33416eb 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -95,6 +95,15 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int 
h); +void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); +void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h); + #define avg_pixels8_mmx ff_avg_pixels8_mmx #define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx #define avg_pixels16_mmx ff_avg_pixels16_mmx @@ -307,6 +316,16 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags) #endif /* HAVE_SSE2_EXTERNAL */ } +static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags) +{ +#if HAVE_SSSE3_EXTERNAL + c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3; + c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; + c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; + c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; +#endif +} + av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) { int cpu_flags = av_get_cpu_flags(); @@ -322,4 +341,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) if (EXTERNAL_SSE2(cpu_flags)) hpeldsp_init_sse2(c, flags, cpu_flags); + + if (EXTERNAL_SSSE3(cpu_flags)) + hpeldsp_init_ssse3(c, flags, cpu_flags); } |