diff options
author | Rostislav Pehlivanov <atomnuker@gmail.com> | 2017-08-18 19:29:33 +0100 |
---|---|---|
committer | Rostislav Pehlivanov <atomnuker@gmail.com> | 2017-08-18 19:32:55 +0100 |
commit | 3c99523a2864af729a8576c3fffe81fb884fa0d5 (patch) | |
tree | 83fc1a6adeb06f50149c84ee125310e7b7fc7a9f | |
parent | 285c015f1077a1d357dacdccc22ab02fd4aefe0a (diff) | |
download | ffmpeg-3c99523a2864af729a8576c3fffe81fb884fa0d5.tar.gz |
opus_pvq_search: split functions into exactness and only use the exact if it's faster
This splits the asm function into exact and non-exact versions. The exact
version is as fast or faster on newer CPUs (which EXTERNAL_AVX_FAST describes
well) whilst the non-exact version is faster than the exact one on older CPUs.
Also fixes yasm compilation which doesn't accept !cpuflags(avx) syntax.
Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
-rw-r--r-- | libavcodec/x86/opus_dsp_init.c | 14 | ||||
-rw-r--r-- | libavcodec/x86/opus_pvq_search.asm | 34 |
2 files changed, 28 insertions, 20 deletions
diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c index c51f786ee8..a9f8a96159 100644 --- a/libavcodec/x86/opus_dsp_init.c +++ b/libavcodec/x86/opus_dsp_init.c @@ -24,9 +24,9 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/opus_pvq.h" -extern float ff_pvq_search_sse2(float *X, int *y, int K, int N); -extern float ff_pvq_search_sse4(float *X, int *y, int K, int N); -extern float ff_pvq_search_avx (float *X, int *y, int K, int N); +extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N); +extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N); +extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N); av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) { @@ -34,12 +34,12 @@ av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) #if CONFIG_OPUS_ENCODER if (EXTERNAL_SSE2(cpu_flags)) - s->pvq_search = ff_pvq_search_sse2; + s->pvq_search = ff_pvq_search_approx_sse2; if (EXTERNAL_SSE4(cpu_flags)) - s->pvq_search = ff_pvq_search_sse4; + s->pvq_search = ff_pvq_search_approx_sse4; - if (EXTERNAL_AVX(cpu_flags)) - s->pvq_search = ff_pvq_search_avx; + if (EXTERNAL_AVX_FAST(cpu_flags)) + s->pvq_search = ff_pvq_search_exact_avx; #endif } diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm index 2f4864c95c..8cf040465d 100644 --- a/libavcodec/x86/opus_pvq_search.asm +++ b/libavcodec/x86/opus_pvq_search.asm @@ -82,7 +82,7 @@ SECTION .text %endif %endmacro -%macro PULSES_SEARCH 1 +%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation ; m6 Syy_norm ; m7 Sxy_norm addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 @@ -96,7 +96,7 @@ align 16 movaps m4, [tmpY + r4] ; y[i] movaps m5, [tmpX + r4] ; X[i] -%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but fast 1/sqrt +%if %2 xorps m0, m0 cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) %endif @@ -104,7 +104,7 @@ align 16 addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm -%if 
!cpuflag(avx) +%if %2 andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. %endif @@ -119,7 +119,7 @@ align 16 andps m5, m0 ; (0<y)?m5:0 %endif -%if !cpuflag(avx) +%if %2 rsqrtps m4, m4 mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) %else @@ -211,8 +211,13 @@ align 16 ; uint32 K - Number of pulses to have after quantizations. ; uint32 N - Number of vector elements. Must be 0 < N < 256 ; -%macro PVQ_FAST_SEARCH 0 -cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N +%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation +%if %1 +cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N +%else +cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N +%endif + %define tmpX rsp %define tmpY outYq @@ -255,7 +260,7 @@ align 16 jz %%zero_input ; if (Sx==0) goto zero_input cvtsi2ss xm0, dword Kd ; m0 = K -%if !cpuflag(avx) +%if %1 rcpss xm1, xm1 ; m1 = approx(1/Sx) mulss xm0, xm1 ; m0 = K*(1/Sx) %else @@ -308,7 +313,7 @@ align 16 align 16 ; K - pulses > 0 %%add_pulses_loop: - PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm + PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm sub Kd, 1 jnz %%add_pulses_loop @@ -320,7 +325,7 @@ align 16 ; K - pulses > 0 align 16 %%remove_pulses_loop: - PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm + PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm add Kd, 1 jnz %%remove_pulses_loop @@ -367,12 +372,15 @@ align 16 jmp %%return %endmacro - +; if 1, use a float op that give half precision but execute for around 3 cycles. +; On Skylake & Ryzen the division is much faster (around 11c/3), +; that makes the full precision code about 2% slower. +; Opus also does use rsqrt approximation in their intrinsics code. INIT_XMM sse2 -PVQ_FAST_SEARCH +PVQ_FAST_SEARCH 1 INIT_XMM sse4 -PVQ_FAST_SEARCH +PVQ_FAST_SEARCH 1 INIT_XMM avx -PVQ_FAST_SEARCH +PVQ_FAST_SEARCH 0 |