diff options
author | Ronald S. Bultje <rsbultje@gmail.com> | 2015-10-06 11:03:45 -0400 |
---|---|---|
committer | Ronald S. Bultje <rsbultje@gmail.com> | 2015-10-13 11:05:58 -0400 |
commit | 6b579cf547a75a0cbda5cb7f10eab9ca07522b0a (patch) | |
tree | 6dc311271278288af98f9d886d39684c62949355 /libavcodec/x86/vp9itxfm_template.asm | |
parent | 1c3be32533e506d66b5a8eb7b93b12d4442146fb (diff) | |
download | ffmpeg-6b579cf547a75a0cbda5cb7f10eab9ca07522b0a.tar.gz |
vp9: add 10bpp simd (mmxext/ssse3) for idct_idct_4x4.
Diffstat (limited to 'libavcodec/x86/vp9itxfm_template.asm')
-rw-r--r-- | libavcodec/x86/vp9itxfm_template.asm | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm index 43cf3aa015..f1a05a5926 100644 --- a/libavcodec/x86/vp9itxfm_template.asm +++ b/libavcodec/x86/vp9itxfm_template.asm @@ -35,3 +35,50 @@ paddw m3, m2 SWAP 3, 2, 1 %endmacro + +; (a*x + b*y + round) >> shift +%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2 + pmaddwd m%1, m%2, %4 + pmaddwd m%2, %5 + paddd m%1, %3 + paddd m%2, %3 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2 + VP9_MULSUB_2W_2X %7, %6, %5, [pw_m%3_%4], [pw_%4_%3] + VP9_MULSUB_2W_2X %1, %2, %5, [pw_m%3_%4], [pw_%4_%3] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2 +%if %0 == 7 + punpckhwd m%6, m%2, m%1 + punpcklwd m%2, m%1 + VP9_MULSUB_2W_4X %1, %2, %3, %4, %5, %6, %7 +%else + punpckhwd m%8, m%4, m%3 + punpcklwd m%2, m%4, m%3 + VP9_MULSUB_2W_4X %1, %2, %5, %6, %7, %8, %9 +%endif +%endmacro + +%macro VP9_IDCT4_1D_FINALIZE 0 + SUMSUB_BA w, 3, 2, 4 ; m3=t3+t0, m2=-t3+t0 + SUMSUB_BA w, 1, 0, 4 ; m1=t2+t1, m0=-t2+t1 + SWAP 0, 3, 2 ; 3102 -> 0123 +%endmacro + +%macro VP9_IDCT4_1D 0 +%if cpuflag(ssse3) + SUMSUB_BA w, 2, 0, 4 ; m2=IN(0)+IN(2) m0=IN(0)-IN(2) + pmulhrsw m2, m6 ; m2=t0 + pmulhrsw m0, m6 ; m0=t1 +%else ; <= sse2 + VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m1=t0 +%endif + VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3 + VP9_IDCT4_1D_FINALIZE +%endmacro |