aboutsummaryrefslogtreecommitdiffstats
path: root/libavcodec/x86/takdsp.asm
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com>, 2023-12-22 20:34:52 -0300
committer: James Almer <jamrial@gmail.com>, 2023-12-23 08:39:22 -0300
commit: 591dc3b4b87598370a7c640de1448871e0e7e8db (patch)
tree: cbdef59585e6f96da946d14ed695f5ed1399c081 /libavcodec/x86/takdsp.asm
parent: 370ce305f4f99073f823628606872f406d4abbff (diff)
download: ffmpeg-591dc3b4b87598370a7c640de1448871e0e7e8db.tar.gz
x86/takdsp: add avx2 versions of all functions
On an Intel Core i7 12700k:

decorrelate_ls_c: 814.3
decorrelate_ls_sse2: 165.8
decorrelate_ls_avx2: 101.3
decorrelate_sf_c: 1602.6
decorrelate_sf_sse4: 640.1
decorrelate_sf_avx2: 324.6
decorrelate_sm_c: 1564.8
decorrelate_sm_sse2: 379.3
decorrelate_sm_avx2: 203.3
decorrelate_sr_c: 785.3
decorrelate_sr_sse2: 176.3
decorrelate_sr_avx2: 99.8

Tested-by: Lynne <dev@lynne.ee>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86/takdsp.asm')
-rw-r--r-- libavcodec/x86/takdsp.asm 41
1 file changed, 27 insertions, 14 deletions
diff --git a/libavcodec/x86/takdsp.asm b/libavcodec/x86/takdsp.asm
index be8e1ab553..d55c5f39aa 100644
--- a/libavcodec/x86/takdsp.asm
+++ b/libavcodec/x86/takdsp.asm
@@ -28,7 +28,7 @@ pd_128: times 4 dd 128
SECTION .text
-INIT_XMM sse2
+%macro TAK_DECORRELATE 0
cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
shl lengthd, 2
add p1q, lengthq
@@ -73,10 +73,8 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
mova m1, [p2q+lengthq]
mova m3, [p1q+lengthq+mmsize]
mova m4, [p2q+lengthq+mmsize]
- mova m2, m1
- mova m5, m4
- psrad m2, 1
- psrad m5, 1
+ psrad m2, m1, 1
+ psrad m5, m4, 1
psubd m0, m2
psubd m3, m5
paddd m1, m0
@@ -88,29 +86,44 @@ cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
add lengthq, mmsize*2
jl .loop
RET
+%endmacro
-INIT_XMM sse4
+INIT_XMM sse2
+TAK_DECORRELATE
+INIT_YMM avx2
+TAK_DECORRELATE
+
+%macro TAK_DECORRELATE_SF 0
cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
shl lengthd, 2
add p1q, lengthq
add p2q, lengthq
neg lengthq
- movd m2, dshiftm
- movd m3, dfactorm
- pshufd m3, m3, 0
- mova m4, [pd_128]
+ movd xm2, dshiftm
+%if UNIX64
+ movd xm3, dfactorm
+ VPBROADCASTD m3, xm3
+%else
+ VPBROADCASTD m3, dfactorm
+%endif
+ VBROADCASTI128 m4, [pd_128]
.loop:
- mova m0, [p1q+lengthq]
mova m1, [p2q+lengthq]
- psrad m1, m2
+ psrad m1, xm2
pmulld m1, m3
paddd m1, m4
psrad m1, 8
- pslld m1, m2
- psubd m1, m0
+ pslld m1, xm2
+ psubd m1, [p1q+lengthq]
mova [p1q+lengthq], m1
add lengthq, mmsize
jl .loop
RET
+%endmacro
+
+INIT_XMM sse4
+TAK_DECORRELATE_SF
+INIT_YMM avx2
+TAK_DECORRELATE_SF