diff options
author | James Darnley <jdarnley@obe.tv> | 2017-03-16 14:59:48 +0100 |
---|---|---|
committer | James Darnley <jdarnley@obe.tv> | 2017-05-15 15:00:17 +0200 |
commit | f61d454ca13f277b6ab7bbc9ebf7d26ce6d67ec6 (patch) | |
tree | f76b67f387dee2a99bc7f794a197e02edd7fcff4 /libavcodec/x86/h264_idct.asm | |
parent | b5325c6711a6789e6219f6392fd9158cb2c0fcd7 (diff) | |
download | ffmpeg-f61d454ca13f277b6ab7bbc9ebf7d26ce6d67ec6.tar.gz |
avcodec/h264: add avx 8-bit h264_idct_add
Haswell:
- 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
Skylake-U:
- 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
Diffstat (limited to 'libavcodec/x86/h264_idct.asm')
-rw-r--r-- | libavcodec/x86/h264_idct.asm | 33 |
1 file changed, 32 insertions, 1 deletion
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index be15afb766..81fe793600 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text
 
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     mova         m6, [pw_32]
-    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %if mmsize == 8
+        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        punpcklwd m0, m1
+        punpcklwd m2, m3
+        SBUTTERFLY dq, 0, 2, 4
+        MOVHL m1, m0
+        MOVHL m3, m2
+    %endif
     paddw        m0, m6
     IDCT4_1D      w, 0, 1, 2, 3, 4, 5
     pxor         m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
 IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movd       %3, [%7]
+    movd       %4, [%7+%8]
+    psraw      %1, %6
+    psraw      %2, %6
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
+    paddw      %3, %1
+    paddw      %4, %2
+    packuswb   %3, %5
+    packuswb   %4, %5
+    movd     [%7], %3
+    movd  [%7+%8], %4
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    IDCT4_ADD    dst_q, block_q, stride_q
+RET