aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/h264_idct.asm
diff options
context:
space:
mode:
authorJames Darnley <jdarnley@obe.tv>2017-03-16 14:59:48 +0100
committerJames Darnley <jdarnley@obe.tv>2017-05-15 15:00:17 +0200
commitf61d454ca13f277b6ab7bbc9ebf7d26ce6d67ec6 (patch)
treef76b67f387dee2a99bc7f794a197e02edd7fcff4 /libavcodec/x86/h264_idct.asm
parentb5325c6711a6789e6219f6392fd9158cb2c0fcd7 (diff)
downloadffmpeg-f61d454ca13f277b6ab7bbc9ebf7d26ce6d67ec6.tar.gz
avcodec/h264: add avx 8-bit h264_idct_add
Haswell: - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext Skylake-U: - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
Diffstat (limited to 'libavcodec/x86/h264_idct.asm')
-rw-r--r--libavcodec/x86/h264_idct.asm33
1 files changed, 32 insertions, 1 deletions
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index be15afb766..81fe793600 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text
IDCT4_1D w, 0, 1, 2, 3, 4, 5
mova m6, [pw_32]
- TRANSPOSE4x4W 0, 1, 2, 3, 4
+ %if mmsize == 8
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ %else
+ punpcklwd m0, m1
+ punpcklwd m2, m3
+ SBUTTERFLY dq, 0, 2, 4
+ MOVHL m1, m0
+ MOVHL m3, m2
+ %endif
paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+ movd %3, [%7]
+ movd %4, [%7+%8]
+ psraw %1, %6
+ psraw %2, %6
+ punpcklbw %3, %5
+ punpcklbw %4, %5
+ paddw %3, %1
+ paddw %4, %2
+ packuswb %3, %5
+ packuswb %4, %5
+ movd [%7], %3
+ movd [%7+%8], %4
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+ movsxdifnidn stride_q, stride_d
+ IDCT4_ADD dst_q, block_q, stride_q
+RET