aboutsummaryrefslogtreecommitdiffstats
path: root/libavutil
diff options
context:
space:
mode:
authorRonald S. Bultje <rsbultje@gmail.com>2016-07-08 17:51:56 -0400
committerRonald S. Bultje <rsbultje@gmail.com>2016-07-11 10:14:58 -0400
commitf0a2b6249bb2426befa4c03247361268e45b13af (patch)
treef86bf79a94cef01c09081900d6effe66811eaa1f /libavutil
parenta2550e7d2e4c1e2f1fd262985a1d789240f7e839 (diff)
downloadffmpeg-f0a2b6249bb2426befa4c03247361268e45b13af.tar.gz
vp9: add 16x16 idct avx2 (8-bit).
checkasm --bench, 10k runs, for *_add_${bpc}_${sub_idct}_${opt}, shows that it's about 1.65x as fast as the AVX version for the full IDCT, and similar speedups for the sub-IDCTs: nop: 24.6 vp9_inv_dct_dct_16x16_add_8_1_c: 6444.8 vp9_inv_dct_dct_16x16_add_8_1_sse2: 638.6 vp9_inv_dct_dct_16x16_add_8_1_ssse3: 484.4 vp9_inv_dct_dct_16x16_add_8_1_avx: 661.2 vp9_inv_dct_dct_16x16_add_8_1_avx2: 311.5 vp9_inv_dct_dct_16x16_add_8_2_c: 6665.7 vp9_inv_dct_dct_16x16_add_8_2_sse2: 646.9 vp9_inv_dct_dct_16x16_add_8_2_ssse3: 455.2 vp9_inv_dct_dct_16x16_add_8_2_avx: 521.9 vp9_inv_dct_dct_16x16_add_8_2_avx2: 304.3 vp9_inv_dct_dct_16x16_add_8_4_c: 7022.7 vp9_inv_dct_dct_16x16_add_8_4_sse2: 647.4 vp9_inv_dct_dct_16x16_add_8_4_ssse3: 467.1 vp9_inv_dct_dct_16x16_add_8_4_avx: 446.1 vp9_inv_dct_dct_16x16_add_8_4_avx2: 297.0 vp9_inv_dct_dct_16x16_add_8_8_c: 6800.4 vp9_inv_dct_dct_16x16_add_8_8_sse2: 598.6 vp9_inv_dct_dct_16x16_add_8_8_ssse3: 465.7 vp9_inv_dct_dct_16x16_add_8_8_avx: 440.9 vp9_inv_dct_dct_16x16_add_8_8_avx2: 290.2 vp9_inv_dct_dct_16x16_add_8_16_c: 6626.6 vp9_inv_dct_dct_16x16_add_8_16_sse2: 599.5 vp9_inv_dct_dct_16x16_add_8_16_ssse3: 475.0 vp9_inv_dct_dct_16x16_add_8_16_avx: 469.9 vp9_inv_dct_dct_16x16_add_8_16_avx2: 286.4
Diffstat (limited to 'libavutil')
-rw-r--r--libavutil/x86/x86util.asm69
1 files changed, 68 insertions, 1 deletions
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index e7493f4883..44ed750ae5 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -30,7 +30,10 @@
%include "libavutil/x86/x86inc.asm"
%macro SBUTTERFLY 4
-%if avx_enabled == 0
+%ifidn %1, dqqq
+ vperm2i128 m%4, m%2, m%3, q0301
+ vinserti128 m%2, m%2, xm%3, 1
+%elif avx_enabled == 0
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
@@ -193,6 +196,70 @@
%endif
%endmacro
+%macro TRANSPOSE16x16W 18-19
+; in: m0..m15, unless %19 in which case m6 is in %17
+; out: m0..m15, unless %19 in which case m4 is in %18
+; spills into %17 and %18
+%if %0 < 19
+ mova %17, m%7
+%endif
+
+ SBUTTERFLY dqqq, %1, %9, %7
+ SBUTTERFLY dqqq, %2, %10, %7
+ SBUTTERFLY dqqq, %3, %11, %7
+ SBUTTERFLY dqqq, %4, %12, %7
+ SBUTTERFLY dqqq, %5, %13, %7
+ SBUTTERFLY dqqq, %6, %14, %7
+ mova %18, m%14
+ mova m%7, %17
+ SBUTTERFLY dqqq, %7, %15, %14
+ SBUTTERFLY dqqq, %8, %16, %14
+
+ SBUTTERFLY wd, %1, %2, %14
+ SBUTTERFLY wd, %3, %4, %14
+ SBUTTERFLY wd, %5, %6, %14
+ SBUTTERFLY wd, %7, %8, %14
+ SBUTTERFLY wd, %9, %10, %14
+ SBUTTERFLY wd, %11, %12, %14
+ mova %17, m%12
+ mova m%14, %18
+ SBUTTERFLY wd, %13, %14, %12
+ SBUTTERFLY wd, %15, %16, %12
+
+ SBUTTERFLY dq, %1, %3, %12
+ SBUTTERFLY dq, %2, %4, %12
+ SBUTTERFLY dq, %5, %7, %12
+ SBUTTERFLY dq, %6, %8, %12
+ SBUTTERFLY dq, %9, %11, %12
+ mova %18, m%11
+ mova m%12, %17
+ SBUTTERFLY dq, %10, %12, %11
+ SBUTTERFLY dq, %13, %15, %11
+ SBUTTERFLY dq, %14, %16, %11
+
+ SBUTTERFLY qdq, %1, %5, %11
+ SBUTTERFLY qdq, %2, %6, %11
+ SBUTTERFLY qdq, %3, %7, %11
+ SBUTTERFLY qdq, %4, %8, %11
+
+ SWAP %2, %5
+ SWAP %4, %7
+
+ SBUTTERFLY qdq, %9, %13, %11
+ SBUTTERFLY qdq, %10, %14, %11
+ mova m%11, %18
+ mova %18, m%5
+ SBUTTERFLY qdq, %11, %15, %5
+ SBUTTERFLY qdq, %12, %16, %5
+
+%if %0 < 19
+ mova m%5, %18
+%endif
+
+ SWAP %10, %13
+ SWAP %12, %15
+%endmacro
+
; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
%macro PABSW 2
%if cpuflag(ssse3)