diff options
author | Lynne <dev@lynne.ee> | 2022-09-19 04:13:04 +0200 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-09-19 06:01:04 +0200 |
commit | 892548e6a1a514fc23c5bb42e549b1a0bb604b6a (patch) | |
tree | 5471d819eac137977c19346b3cdf9bcc43bb3505 /libavutil | |
parent | af42bb3d61c82da0c82631b07b329a280ae83d17 (diff) | |
download | ffmpeg-892548e6a1a514fc23c5bb42e549b1a0bb604b6a.tar.gz |
x86/tx_float: fully support 128bit regs in LOAD64_LUT
The gather path didn't support 128bit registers.
It's not faster on Zen 3, but it's here for completeness.
Diffstat (limited to 'libavutil')
-rw-r--r-- | libavutil/x86/tx_float.asm | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 3b3e26ebcb..b644db49be 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -107,19 +107,19 @@ SECTION .text
 ; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
 %macro LOAD64_LUT 5-7
 %if %0 > 6 && cpuflag(avx2)
-    pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
-    movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction
-    vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args
+    pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
+    movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
+    vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
 %else
     mov %5d, [%3 + %4 + 0]
     movsd xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
     mov %5d, [%3 + %4 + 8]
     movsd xmm%6, [%2 + %5q*8]
 %endif
     mov %5d, [%3 + %4 + 4]
     movhps xmm%1, [%2 + %5q*8]
-%if mmsize == 32
+%if sizeof%1 > 16 && %0 > 5
     mov %5d, [%3 + %4 + 12]
     movhps xmm%6, [%2 + %5q*8]
     vinsertf128 %1, %1, xmm%6, 1