diff options
author | Rostislav Pehlivanov <atomnuker@gmail.com> | 2017-08-18 17:28:40 +0100 |
---|---|---|
committer | Rostislav Pehlivanov <atomnuker@gmail.com> | 2017-08-18 17:30:41 +0100 |
commit | f386dd70acdc81d42d6bcb885d2889634cdf45b7 (patch) | |
tree | b6b29e9200dcfcfea9921cf93b678598ed529309 | |
parent | 8e53cd1fab86b8774f0b5b0f186d8e0cd9c1895b (diff) | |
download | ffmpeg-f386dd70acdc81d42d6bcb885d2889634cdf45b7.tar.gz |
opus_pvq_search: only use rsqrtps approximation on CPUs without avx
Makes the search produce identical results to the C version.
Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
-rw-r--r-- | libavcodec/x86/opus_pvq_search.asm | 18 |
1 files changed, 6 insertions, 12 deletions
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm index beb6cbcc9b..2f4864c95c 100644 --- a/libavcodec/x86/opus_pvq_search.asm +++ b/libavcodec/x86/opus_pvq_search.asm @@ -28,12 +28,6 @@ ALIGNMODE p6 %endif -; Use float op that give half precision but execute for around 3 cycles. -; On Skylake & Ryzen the division is much faster (around 11c/3), -; that makes the full precision code about 2% slower. -; Opus also does use rsqrt approximation in their intrinsics code. -%define USE_APPROXIMATION 1 - SECTION_RODATA 64 const_float_abs_mask: times 8 dd 0x7fffffff @@ -102,17 +96,17 @@ align 16 movaps m4, [tmpY + r4] ; y[i] movaps m5, [tmpX + r4] ; X[i] - %if USE_APPROXIMATION == 1 +%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but fast 1/sqrt xorps m0, m0 cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) - %endif +%endif addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm - %if USE_APPROXIMATION == 1 +%if !cpuflag(avx) andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. - %endif +%endif %else movaps m5, [tmpY + r4] ; m5 = y[i] @@ -125,7 +119,7 @@ align 16 andps m5, m0 ; (0<y)?m5:0 %endif -%if USE_APPROXIMATION == 1 +%if !cpuflag(avx) rsqrtps m4, m4 mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) %else @@ -261,7 +255,7 @@ align 16 jz %%zero_input ; if (Sx==0) goto zero_input cvtsi2ss xm0, dword Kd ; m0 = K -%if USE_APPROXIMATION == 1 +%if !cpuflag(avx) rcpss xm1, xm1 ; m1 = approx(1/Sx) mulss xm0, xm1 ; m0 = K*(1/Sx) %else |