aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2014-08-31 23:43:02 -0300
committer James Almer <jamrial@gmail.com> 2014-09-04 20:21:29 -0300
commit c3d2426cca940b519ac654eb36ba04d0acc86ca9 (patch)
tree ca75ece6fcc704413a39231700cbdd4eccbbbcc3
parent 467a55a4ee54c4ff17b503a5ea5e5c054ab23e9b (diff)
download ffmpeg-c3d2426cca940b519ac654eb36ba04d0acc86ca9.tar.gz
x86/hevc_res_add: add ff_hevc_transform_add32_8_avx2
~20% faster than AVX.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- libavcodec/x86/hevc_res_add.asm 31
-rw-r--r-- libavcodec/x86/hevcdsp.h 2
-rw-r--r-- libavcodec/x86/hevcdsp_init.c 2
3 files changed, 31 insertions, 4 deletions
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index 7238fb39c7..488c5b7190 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -89,8 +89,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
%endmacro
%macro TR_ADD_SSE_16_32_8 3
- mova m2, [r1+%1 ]
- mova m6, [r1+%1+16]
+ mova xm2, [r1+%1 ]
+ mova xm6, [r1+%1+16]
+%if cpuflag(avx2)
+ vinserti128 m2, m2, [r1+%1+32], 1
+ vinserti128 m6, m6, [r1+%1+48], 1
+%endif
%if cpuflag(avx)
psubw m1, m0, m2
psubw m5, m0, m6
@@ -103,8 +107,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
packuswb m2, m6
packuswb m1, m5
- mova m4, [r1+%1+32]
- mova m6, [r1+%1+48]
+ mova xm4, [r1+%1+mmsize*2 ]
+ mova xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+ vinserti128 m4, m4, [r1+%1+96 ], 1
+ vinserti128 m6, m6, [r1+%1+112], 1
+%endif
%if cpuflag(avx)
psubw m3, m0, m4
psubw m5, m0, m6
@@ -169,6 +177,21 @@ TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8
+INIT_YMM avx2
+; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+ pxor m0, m0
+ lea r3, [r2*3]
+ TR_ADD_SSE_16_32_8 0, r0, r0+r2
+ TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
+ add r1, 256
+ lea r0, [r0+r2*4]
+ TR_ADD_SSE_16_32_8 0, r0, r0+r2
+ TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+ RET
+
;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 839e05269a..8dea1428f0 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -143,6 +143,8 @@ void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 6bcced6ffc..eaa97e1434 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -555,6 +555,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_AVX2(cpu_flags)) {
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+
+ c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {