aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorChristophe Gisquet <christophe.gisquet@gmail.com>2014-05-22 23:47:06 +0200
committerMichael Niedermayer <michaelni@gmx.at>2014-05-24 15:15:56 +0200
commit81aa0f4604f98da692f2689c84968f90354a92ea (patch)
tree243197f645e40a7dd5219df7992e4f8452fc73a9 /libavcodec/x86
parent726316240bcc41cef6053dd6d1e46a3c57328498 (diff)
downloadffmpeg-81aa0f4604f98da692f2689c84968f90354a92ea.tar.gz
x86: hpeldsp: implement SSSE3 version of _xy2
Loading pb_1 into a register rather than pw_8192 was benchmarked to be more efficient. Loading both of them yields no advantage; loading one of them saves ~11 cycles. Decicycle counts: put8: 3223 (mmx) -> 2387; avg8: 2863 (mmxext) -> 2125; put16: 4356 (sse2) -> 3553; avg16: 4481 (sse2) -> 3513. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/hpeldsp.asm70
-rw-r--r--libavcodec/x86/hpeldsp_init.c22
2 files changed, 92 insertions, 0 deletions
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 4af423aee5..76e4632cbc 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -30,6 +30,9 @@
SECTION_RODATA
cextern pb_1
cextern pw_2
+pw_8192: times 8 dw (1<<13) ; pmulhrsw multiplier: (x*8192 + 0x4000) >> 15 == (x + 2) >> 2
+pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; pshufb mask: alternate bytes of the low and high 8-byte halves
+pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 ; pshufb mask: alternate bytes of the low and high 4-byte halves
SECTION_TEXT
@@ -635,3 +638,70 @@ SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
+
+%macro SSSE3_PIXELS_XY2 1-2 ; %1 = put or avg; optional %2 = vector register count (selects the 16-pixel variant)
+%if %0 == 2 ; two macro arguments: 16-pixel-wide variant (instantiated under INIT_XMM)
+cglobal %1_pixels16_xy2, 4,5,%2 ; r0..r3 presumably block, pixels, line_size, h (per the C prototypes); r4 is scratch
+ mova m4, [pb_interleave16] ; m4 = shuffle mask used after packuswb
+%else ; one macro argument: 8-pixel-wide variant (instantiated under INIT_MMX)
+cglobal %1_pixels8_xy2, 4,5
+ mova m4, [pb_interleave8] ; m4 = shuffle mask used after packuswb
+%endif
+ mova m5, [pb_1] ; m5 = pb_1 (external constant, presumably all bytes 1), multiplier for pmaddubsw
+ movu m0, [r1] ; first source row, starting at byte 0
+ movu m1, [r1+1] ; first source row, starting at byte 1
+ pmaddubsw m0, m5 ; words = p[2i] + p[2i+1] (assuming pb_1 is all ones): sums for even outputs
+ pmaddubsw m1, m5 ; words = p[2i+1] + p[2i+2]: sums for odd outputs
+ xor r4, r4 ; r4 = running byte offset shared by source and destination
+ add r1, r2 ; r1 now points one row below, so [r1+r4] is the row under destination row r4
+.loop: ; each iteration produces two output rows
+ movu m2, [r1+r4] ; next source row, byte 0
+ movu m3, [r1+r4+1] ; next source row, byte 1
+ pmaddubsw m2, m5 ; horizontal pair sums, even outputs
+ pmaddubsw m3, m5 ; horizontal pair sums, odd outputs
+ paddusw m0, m2 ; add the row above: 4-pixel sums (even outputs)
+ paddusw m1, m3 ; add the row above: 4-pixel sums (odd outputs)
+ pmulhrsw m0, [pw_8192] ; rounded divide by 4: (sum + 2) >> 2
+ pmulhrsw m1, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4] ; avg only: fetch current destination row (aligned load)
+ packuswb m0, m1 ; bytes: even outputs in low half, odd outputs in high half
+ pshufb m0, m4 ; re-interleave into natural pixel order
+ pavgb m0, m6 ; rounded byte average with the existing destination
+%else
+ packuswb m0, m1 ; bytes: even outputs in low half, odd outputs in high half
+ pshufb m0, m4 ; re-interleave into natural pixel order
+%endif
+ mova [r0+r4], m0 ; aligned store of the finished row
+ add r4, r2 ; advance one row
+
+ movu m0, [r1+r4] ; second half: m2/m3 still hold the previous row's pair sums; m0/m1 are reloaded
+ movu m1, [r1+r4+1]
+ pmaddubsw m0, m5 ; these pair sums are reused as the "row above" in the next iteration
+ pmaddubsw m1, m5
+ paddusw m2, m0 ; 4-pixel sums (even outputs)
+ paddusw m3, m1 ; 4-pixel sums (odd outputs)
+ pmulhrsw m2, [pw_8192] ; rounded divide by 4
+ pmulhrsw m3, [pw_8192]
+%ifidn %1, avg
+ mova m6, [r0+r4] ; avg only: fetch current destination row
+ packuswb m2, m3
+ pshufb m2, m4
+ pavgb m2, m6
+%else
+ packuswb m2, m3
+ pshufb m2, m4
+%endif
+ mova [r0+r4], m2 ; aligned store of the finished row
+ add r4, r2 ; advance one row
+ sub r3d, 2 ; two rows done; NOTE(review): the exit test only hits zero if h is even and nonzero - confirm callers
+ jnz .loop
+ REP_RET
+%endmacro
+
+INIT_MMX ssse3 ; one-argument form below emits the pixels8 variants
+SSSE3_PIXELS_XY2 put
+SSSE3_PIXELS_XY2 avg
+INIT_XMM ssse3 ; two-argument form below emits the pixels16 variants
+SSSE3_PIXELS_XY2 put, 6 ; put touches m0-m5
+SSSE3_PIXELS_XY2 avg, 7 ; avg additionally uses m6 for the destination row
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index cda16dc722..42e33416eb 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -95,6 +95,15 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h); /* 8-wide xy2 put; presumably the asm symbol emitted by SSSE3_PIXELS_XY2 in hpeldsp.asm */
+void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h); /* 8-wide xy2 avg (result averaged with the existing block contents) */
+void ff_put_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h); /* 16-wide xy2 put */
+void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h); /* 16-wide xy2 avg */
+
+
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
@@ -307,6 +316,16 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
#endif /* HAVE_SSE2_EXTERNAL */
}
+static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags) /* Installs the SSSE3 xy2 routines into c; flags and cpu_flags are not read here. */
+{
+#if HAVE_SSSE3_EXTERNAL /* presumably set when the external SSSE3 assembly is built - confirm in configure */
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3; /* slot [0][3]: 16-wide xy2 put */
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; /* slot [0][3]: 16-wide xy2 avg */
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; /* slot [1][3]: 8-wide xy2 put */
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; /* slot [1][3]: 8-wide xy2 avg */
+#endif
+}
+
av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
@@ -322,4 +341,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
if (EXTERNAL_SSE2(cpu_flags))
hpeldsp_init_sse2(c, flags, cpu_flags);
+
+ if (EXTERNAL_SSSE3(cpu_flags))
+ hpeldsp_init_ssse3(c, flags, cpu_flags);
}