about | summary | refs | log | tree | commit | diff | stats
path: root/libavutil/x86/tx_float.asm
diff options
context:
space:
mode:
author Lynne <dev@lynne.ee> 2022-09-19 04:13:04 +0200
committer Lynne <dev@lynne.ee> 2022-09-19 06:01:04 +0200
commit 892548e6a1a514fc23c5bb42e549b1a0bb604b6a (patch)
tree 5471d819eac137977c19346b3cdf9bcc43bb3505 /libavutil/x86/tx_float.asm
parent af42bb3d61c82da0c82631b07b329a280ae83d17 (diff)
download ffmpeg-892548e6a1a514fc23c5bb42e549b1a0bb604b6a.tar.gz
x86/tx_float: fully support 128-bit regs in LOAD64_LUT
The gather path didn't support 128-bit registers. It's not faster on Zen 3, but it's here for completeness.
Diffstat (limited to 'libavutil/x86/tx_float.asm')
-rw-r--r-- libavutil/x86/tx_float.asm 10
1 files changed, 5 insertions, 5 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 3b3e26ebcb..b644db49be 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -107,19 +107,19 @@ SECTION .text
; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
- pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
- movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
- vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
+ pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
+ movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
+ vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
%else
mov %5d, [%3 + %4 + 0]
movsd xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
mov %5d, [%3 + %4 + 8]
movsd xmm%6, [%2 + %5q*8]
%endif
mov %5d, [%3 + %4 + 4]
movhps xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
mov %5d, [%3 + %4 + 12]
movhps xmm%6, [%2 + %5q*8]
vinsertf128 %1, %1, xmm%6, 1