diff options
author | James Almer <jamrial@gmail.com> | 2014-08-20 19:36:29 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2014-08-21 15:01:33 -0300 |
commit | 54ca4dd43bdc8658b7304d9309cdb096c8e8a394 (patch) | |
tree | 087d3f24d4603982f1fc13c67162a89db1ebdc45 | |
parent | 4a5cc34b46a8bf8d47ec907383be83b6153b9f69 (diff) | |
download | ffmpeg-54ca4dd43bdc8658b7304d9309cdb096c8e8a394.tar.gz |
x86/hevc_res_add: refactor ff_hevc_transform_add{16,32}_8
* Reduced xmm register count to 7 (as such, they are now enabled for x86_32).
* Removed four movdqa (affects the sse2 version only).
* pxor is now used to clear m0 only once.
~5% faster.
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/hevc_res_add.asm | 122 | ||||
-rw-r--r-- | libavcodec/x86/hevcdsp_init.c | 10 |
2 files changed, 51 insertions, 81 deletions
diff --git a/libavcodec/x86/hevc_res_add.asm b/libavcodec/x86/hevc_res_add.asm index feea50c67b..7238fb39c7 100644 --- a/libavcodec/x86/hevc_res_add.asm +++ b/libavcodec/x86/hevc_res_add.asm @@ -88,71 +88,41 @@ cglobal hevc_transform_add4_8, 3, 4, 6 movhps [r0+r3 ], m1 %endmacro -%macro TR_ADD_INIT_SSE_8 0 - pxor m0, m0 - - mova m4, [r1] - mova m1, [r1+16] - psubw m2, m0, m1 - psubw m5, m0, m4 - packuswb m4, m1 - packuswb m5, m2 - - mova m6, [r1+32] - mova m1, [r1+48] - psubw m2, m0, m1 - psubw m7, m0, m6 - packuswb m6, m1 - packuswb m7, m2 - - mova m8, [r1+64] - mova m1, [r1+80] - psubw m2, m0, m1 - psubw m9, m0, m8 - packuswb m8, m1 - packuswb m9, m2 - - mova m10, [r1+96] - mova m1, [r1+112] - psubw m2, m0, m1 - psubw m11, m0, m10 - packuswb m10, m1 - packuswb m11, m2 -%endmacro - - -%macro TR_ADD_SSE_16_8 0 - TR_ADD_INIT_SSE_8 - - paddusb m0, m4, [r0 ] - paddusb m1, m6, [r0+r2 ] - paddusb m2, m8, [r0+r2*2] - paddusb m3, m10,[r0+r3 ] - psubusb m0, m5 - psubusb m1, m7 - psubusb m2, m9 - psubusb m3, m11 - mova [r0 ], m0 - mova [r0+r2 ], m1 - mova [r0+2*r2], m2 - mova [r0+r3 ], m3 -%endmacro - -%macro TR_ADD_SSE_32_8 0 - TR_ADD_INIT_SSE_8 - - paddusb m0, m4, [r0 ] - paddusb m1, m6, [r0+16 ] - paddusb m2, m8, [r0+r2 ] - paddusb m3, m10,[r0+r2+16] - psubusb m0, m5 - psubusb m1, m7 - psubusb m2, m9 - psubusb m3, m11 - mova [r0 ], m0 - mova [r0+16 ], m1 - mova [r0+r2 ], m2 - mova [r0+r2+16], m3 +%macro TR_ADD_SSE_16_32_8 3 + mova m2, [r1+%1 ] + mova m6, [r1+%1+16] +%if cpuflag(avx) + psubw m1, m0, m2 + psubw m5, m0, m6 +%else + mova m1, m0 + mova m5, m0 + psubw m1, m2 + psubw m5, m6 +%endif + packuswb m2, m6 + packuswb m1, m5 + + mova m4, [r1+%1+32] + mova m6, [r1+%1+48] +%if cpuflag(avx) + psubw m3, m0, m4 + psubw m5, m0, m6 +%else + mova m3, m0 + mova m5, m0 + psubw m3, m4 + psubw m5, m6 +%endif + packuswb m4, m6 + packuswb m3, m5 + + paddusb m2, [%2] + paddusb m4, [%3] + psubusb m2, m1 + psubusb m4, m3 + mova [%2], m2 + mova [%3], m4 %endmacro @@ -166,30 +136,32 @@ 
cglobal hevc_transform_add8_8, 3, 4, 8 TR_ADD_SSE_8_8 RET -%if ARCH_X86_64 ; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) -cglobal hevc_transform_add16_8, 3, 4, 12 +cglobal hevc_transform_add16_8, 3, 4, 7 + pxor m0, m0 lea r3, [r2*3] - TR_ADD_SSE_16_8 + TR_ADD_SSE_16_32_8 0, r0, r0+r2 + TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3 %rep 3 add r1, 128 lea r0, [r0+r2*4] - TR_ADD_SSE_16_8 + TR_ADD_SSE_16_32_8 0, r0, r0+r2 + TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3 %endrep RET ; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) -cglobal hevc_transform_add32_8, 3, 4, 12 - - TR_ADD_SSE_32_8 +cglobal hevc_transform_add32_8, 3, 4, 7 + pxor m0, m0 + TR_ADD_SSE_16_32_8 0, r0, r0+16 + TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16 %rep 15 add r1, 128 lea r0, [r0+r2*2] - TR_ADD_SSE_32_8 + TR_ADD_SSE_16_32_8 0, r0, r0+16 + TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16 %endrep RET - -%endif ;ARCH_X86_64 %endmacro INIT_XMM sse2 diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index f6f0a4bddd..07091589be 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -477,15 +477,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2; - - c->transform_add[2] = ff_hevc_transform_add16_8_sse2; - c->transform_add[3] = ff_hevc_transform_add32_8_sse2; } c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2; c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2; c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2; c->transform_add[1] = ff_hevc_transform_add8_8_sse2; + c->transform_add[2] = ff_hevc_transform_add16_8_sse2; + c->transform_add[3] = ff_hevc_transform_add32_8_sse2; } if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; @@ -509,11 +508,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const 
int bit_depth) if (ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; - - c->transform_add[2] = ff_hevc_transform_add16_8_avx; - c->transform_add[3] = ff_hevc_transform_add32_8_avx; } c->transform_add[1] = ff_hevc_transform_add8_8_avx; + c->transform_add[2] = ff_hevc_transform_add16_8_avx; + c->transform_add[3] = ff_hevc_transform_add32_8_avx; } if (EXTERNAL_AVX2(cpu_flags)) { c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; |