diff options
author | James Almer <jamrial@gmail.com> | 2022-09-20 16:02:49 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2022-09-22 13:27:43 -0300 |
commit | 48615f0a7861fd1c24195f74856f68e06c7ca73c (patch) | |
tree | 3a9120e8558ca989f16688cdebd518bf598ad0c4 | |
parent | 2bcf86d53df64276fbe1b95d8eaf335be0a766ab (diff) | |
download | ffmpeg-48615f0a7861fd1c24195f74856f68e06c7ca73c.tar.gz |
x86/aacpsdsp: add ps_hybrid_analysis_fma3
This replaces the sse3 version, which was not really faster than the sse one.
Signed-off-by: James Almer <jamrial@gmail.com>
-rw-r--r-- | libavcodec/x86/aacpsdsp.asm | 42 | ||||
-rw-r--r-- | libavcodec/x86/aacpsdsp_init.c | 6 |
2 files changed, 25 insertions, 23 deletions
diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm index 543d33e68d..105e1af5c5 100644 --- a/libavcodec/x86/aacpsdsp.asm +++ b/libavcodec/x86/aacpsdsp.asm @@ -403,10 +403,8 @@ HYBRID_SYNTHESIS_DEINT %macro PS_HYBRID_ANALYSIS_IN 1 movu m0, [inq+mmsize*%1] movu m1, [inq+mmsize*(5-%1)+8] - mova m3, m0 - mova m4, m1 - shufps m3, m3, q2301 - shufps m4, m4, q0123 + shufps m3, m0, m0, q2301 + shufps m4, m1, m1, q0123 shufps m1, m1, q1032 %if cpuflag(sse3) addsubps m3, m4 @@ -424,6 +422,15 @@ HYBRID_SYNTHESIS_DEINT %macro PS_HYBRID_ANALYSIS_LOOP 3 mova m2, [filterq+nq+mmsize*%3] shufps m2, m2, q2301 +%if cpuflag(fma3) +%if %3 + fmaddps m3, m2, [rsp+mmsize*%3*2], m3 + fmaddps m0, m2, [rsp+mmsize+mmsize*%3*2], m0 +%else + mulps m3, m2, [rsp] + mulps m0, m2, [rsp+mmsize] +%endif +%else ; cpuflag(sse) mova %2, [rsp+mmsize*%3*2] mova %1, [rsp+mmsize+mmsize*%3*2] mulps %2, m2 @@ -432,20 +439,21 @@ HYBRID_SYNTHESIS_DEINT addps m3, %2 addps m0, %1 %endif +%endif %endmacro %macro PS_HYBRID_ANALYSIS 0 -cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n +cglobal ps_hybrid_analysis, 5, 5, 5 + notcpuflag(fma3) * 3, 24 * 4, out, in, filter, stride, n %if cpuflag(sse3) %define MOVH movsd %else %define MOVH movlps + mova m7, [ps_p1m1p1m1] %endif shl strideq, 3 shl nd, 6 add filterq, nq neg nq - mova m7, [ps_p1m1p1m1] PS_HYBRID_ANALYSIS_IN 0 PS_HYBRID_ANALYSIS_IN 1 PS_HYBRID_ANALYSIS_IN 2 @@ -456,30 +464,22 @@ align 16 PS_HYBRID_ANALYSIS_LOOP m5, m6, 1 PS_HYBRID_ANALYSIS_LOOP m5, m6, 2 -%if cpuflag(sse3) - pshufd m3, m3, q2301 - xorps m0, m7 - hsubps m3, m0 - pshufd m1, m3, q0020 - pshufd m3, m3, q0031 - addps m1, m3 - movsd m2, [inq+6*8] -%else - mova m1, m3 - mova m2, m0 - shufps m1, m1, q2301 - shufps m2, m2, q2301 + shufps m1, m3, m3, q2301 + shufps m2, m0, m0, q2301 subps m1, m3 addps m2, m0 unpcklps m3, m1, m2 unpckhps m1, m2 addps m1, m3 movu m2, [inq+6*8] ; faster than movlps and no risk of overread -%endif movss m3, [filterq+nq+8*6] SPLATD 
m3 +%if cpuflag(fma3) + fmaddps m1, m2, m3, m1 +%else mulps m2, m3 addps m1, m2 +%endif MOVH [outq], m1 add outq, strideq add nq, 64 @@ -489,5 +489,5 @@ align 16 INIT_XMM sse PS_HYBRID_ANALYSIS -INIT_XMM sse3 +INIT_XMM fma3 PS_HYBRID_ANALYSIS diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c index 21f00efa24..0b0ee07db4 100644 --- a/libavcodec/x86/aacpsdsp_init.c +++ b/libavcodec/x86/aacpsdsp_init.c @@ -33,7 +33,7 @@ void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2], void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2], const float (*filter)[8][2], ptrdiff_t stride, int n); -void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2], +void ff_ps_hybrid_analysis_fma3(float (*out)[2], float (*in)[2], const float (*filter)[8][2], ptrdiff_t stride, int n); void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], @@ -64,9 +64,11 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s) s->add_squares = ff_ps_add_squares_sse3; s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3; s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3; - s->hybrid_analysis = ff_ps_hybrid_analysis_sse3; } if (EXTERNAL_SSE4(cpu_flags)) { s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4; } + if (EXTERNAL_FMA3(cpu_flags)) { + s->hybrid_analysis = ff_ps_hybrid_analysis_fma3; + } } |