diff options
author | James Almer <jamrial@gmail.com> | 2014-08-31 23:43:02 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-09-04 20:21:29 -0300 |
commit | c3d2426cca940b519ac654eb36ba04d0acc86ca9 (patch) | |
tree | ca75ece6fcc704413a39231700cbdd4eccbbbcc3 | |
parent | 467a55a4ee54c4ff17b503a5ea5e5c054ab23e9b (diff) | |
download | ffmpeg-c3d2426cca940b519ac654eb36ba04d0acc86ca9.tar.gz |
x86/hevc_res_add: add ff_hevc_transform_add32_8_avx2
~20% faster than AVX.
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/hevc_res_add.asm | 31 | ||||
-rw-r--r-- | libavcodec/x86/hevcdsp.h | 2 | ||||
-rw-r--r-- | libavcodec/x86/hevcdsp_init.c | 2 |
3 files changed, 31 insertions, 4 deletions
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm
index 7238fb39c7..488c5b7190 100644
--- a/libavcodec/x86/hevc_res_add.asm
+++ b/libavcodec/x86/hevc_res_add.asm
@@ -89,8 +89,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
 %endmacro
 
 %macro TR_ADD_SSE_16_32_8 3
-    mova              m2, [r1+%1   ]
-    mova              m6, [r1+%1+16]
+    mova             xm2, [r1+%1   ]
+    mova             xm6, [r1+%1+16]
+%if cpuflag(avx2)
+    vinserti128       m2, m2, [r1+%1+32], 1
+    vinserti128       m6, m6, [r1+%1+48], 1
+%endif
 %if cpuflag(avx)
     psubw             m1, m0, m2
     psubw             m5, m0, m6
@@ -103,8 +107,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
     packuswb          m2, m6
     packuswb          m1, m5
 
-    mova              m4, [r1+%1+32]
-    mova              m6, [r1+%1+48]
+    mova             xm4, [r1+%1+mmsize*2   ]
+    mova             xm6, [r1+%1+mmsize*2+16]
+%if cpuflag(avx2)
+    vinserti128       m4, m4, [r1+%1+96 ], 1
+    vinserti128       m6, m6, [r1+%1+112], 1
+%endif
 %if cpuflag(avx)
     psubw             m3, m0, m4
     psubw             m5, m0, m6
@@ -169,6 +177,21 @@ TRANSFORM_ADD_8
 INIT_XMM avx
 TRANSFORM_ADD_8
 
+INIT_YMM avx2
+; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_transform_add32_8, 3, 4, 7
+    pxor              m0, m0
+    lea               r3, [r2*3]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%rep 7
+    add               r1, 256
+    lea               r0, [r0+r2*4]
+    TR_ADD_SSE_16_32_8   0, r0,      r0+r2
+    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+%endrep
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 839e05269a..8dea1428f0 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -143,6 +143,8 @@ void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
 void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 
+void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+
 void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 6bcced6ffc..eaa97e1434 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -555,6 +555,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
+
+            c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {