aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2015-02-12 03:11:37 -0300
committer James Almer <jamrial@gmail.com> 2015-02-12 13:21:58 -0300
commit 1679d68dbfae8414e1e805823758c02c17188dd4 (patch)
tree e3080437c52c83a058105c33700791cefeceb5d0
parent 14b44c16142f9863ac0c853e3a79ae3ec3c6cf6e (diff)
download ffmpeg-1679d68dbfae8414e1e805823758c02c17188dd4.tar.gz
x86/hevc_mc: optimize AVX2 mc functions
Before
40766 decicycles in ff_hevc_put_hevc_qpel_h64_8_avx2, 8192 runs, 0 skips

After
37975 decicycles in ff_hevc_put_hevc_qpel_h64_8_avx2, 8192 runs, 0 skips

Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- libavcodec/x86/hevc_mc.asm 32
1 file changed, 12 insertions, 20 deletions
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 027daa8735..9a4c9ca209 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -489,14 +489,12 @@ QPEL_TABLE 10, 8, w, avx2
%if %1 == 8
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
- vextracti128 xm10, m0, 1
- vinserti128 m10, m1, xm10, 0
+ vperm2i128 m10, m0, m1, q0301
%endif
vinserti128 m0, m0, xm1, 1
mova m1, m10
%if %2 > 16
- vextracti128 xm10, m2, 1
- vinserti128 m10, m3, xm10, 0
+ vperm2i128 m10, m2, m3, q0301
%endif
vinserti128 m2, m2, xm3, 1
mova m3, m10
@@ -583,26 +581,22 @@ QPEL_TABLE 10, 8, w, avx2
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)
- vextracti128 xm10, m0, 1
- vinserti128 m10, m1, xm10, 0
+ vperm2i128 m10, m0, m1, q0301
vinserti128 m0, m0, xm1, 1
- mova m1, m10
+ SWAP 1, 10
- vextracti128 xm10, m2, 1
- vinserti128 m10, m3, xm10, 0
+ vperm2i128 m10, m2, m3, q0301
vinserti128 m2, m2, xm3, 1
- mova m3, m10
+ SWAP 3, 10
- vextracti128 xm10, m4, 1
- vinserti128 m10, m5, xm10, 0
+ vperm2i128 m10, m4, m5, q0301
vinserti128 m4, m4, xm5, 1
- mova m5, m10
+ SWAP 5, 10
- vextracti128 xm10, m6, 1
- vinserti128 m10, m7, xm10, 0
+ vperm2i128 m10, m6, m7, q0301
vinserti128 m6, m6, xm7, 1
- mova m7, m10
+ SWAP 7, 10
%endif
pmaddubsw m0, m12 ;x1*c1+x2*c2
@@ -889,8 +883,7 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
vinserti128 m2, m0, xm4, 1
- vextracti128 xm3, m0, 1
- vinserti128 m3, m4, xm3, 0
+ vperm2i128 m3, m0, m4, q0301
PEL_10STORE%1 dstq, m2, m3
%else
PEL_10STORE%1 dstq, m0, m4
@@ -1021,8 +1014,7 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
SIMPLE_BILOAD %1, src2q, m8, m3
%if cpuflag(avx2)
vinserti128 m1, m8, xm3, 1
- vextracti128 xm8, m8, 1
- vinserti128 m2, m3, xm8, 0
+ vperm2i128 m2, m8, m3, q0301
BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
%else
BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]